lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
@@ -47,30 +47,17 @@ Development
  import pandas as pd
  import numpy as np
  from itertools import product
- import joblib
- import os
-
- from sklearn.compose import ColumnTransformer
- from sklearn.decomposition import PCA
- from sklearn.impute import SimpleImputer
- from sklearn.preprocessing import StandardScaler
- from sklearn.pipeline import Pipeline
- from category_encoders import BinaryEncoder, CountEncoder
- from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
- from sklearn.model_selection import train_test_split

  from lecrapaud.integrations.openai_integration import (
      truncate_text,
      get_openai_embeddings,
  )
- from lecrapaud.feature_selection import get_features_by_types
  from lecrapaud.utils import logger
- from lecrapaud.db import Target, Feature, Experiment
- from lecrapaud.config import PYTHON_ENV
+ from lecrapaud.mixins import LeCrapaudEstimatorMixin


  # main function
- class FeatureEngineeringEngine:
+ class FeatureEngineering(LeCrapaudEstimatorMixin):
      """
      Feature engineering pipeline

@@ -86,24 +73,39 @@ class FeatureEngineeringEngine:

      def __init__(
          self,
-         data: pd.DataFrame,
-         columns_drop: list[str] = [],
-         columns_boolean: list[str] = [],
-         columns_date: list[str] = [],
-         columns_te_groupby: list[str] = [],
-         columns_te_target: list[str] = [],
+         experiment=None,
          for_training: bool = True,
          **kwargs,
      ):
-         self.data = data
-         self.columns_drop = columns_drop
-         self.columns_boolean = columns_boolean
-         self.columns_date = columns_date
-         self.columns_te_groupby = columns_te_groupby
-         self.columns_te_target = columns_te_target
-         self.for_training = for_training
-
-     def run(self) -> pd.DataFrame:
+         # The mixin will automatically set all experiment.context parameters as attributes
+         # and kwargs will override them if provided
+         super().__init__(experiment=experiment, for_training=for_training, **kwargs)
+
+         # Set defaults for required parameters if not provided
+         if not hasattr(self, 'columns_drop'):
+             self.columns_drop = []
+         if not hasattr(self, 'columns_boolean'):
+             self.columns_boolean = []
+         if not hasattr(self, 'columns_date'):
+             self.columns_date = []
+         if not hasattr(self, 'columns_te_groupby'):
+             self.columns_te_groupby = []
+         if not hasattr(self, 'columns_te_target'):
+             self.columns_te_target = []
+
+     def fit(self, X, y=None):
+         """
+         Fit the feature engineering estimator.
+
+         Args:
+             X (pd.DataFrame): Input data
+             y: Target values (ignored)
+
+         Returns:
+             Transformed data (for compatibility with existing code)
+         """
+         self.data = X.copy()
+
          # drop columns
          self.data = self.data.drop(columns=self.columns_drop, errors="ignore")

@@ -126,6 +128,17 @@ class FeatureEngineeringEngine:
          # Cyclic encode dates
          self.data = self.cyclic_encode_date()

+         self._set_fitted()
+         return self
+
+     def get_data(self):
+         """
+         Get the transformed data after feature engineering.
+
+         Returns:
+             pd.DataFrame: The transformed data with engineered features
+         """
+         self._check_is_fitted()
          return self.data

      def cyclic_encode_date(self) -> pd.DataFrame:
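Taken together, the two hunks above suggest a scikit-learn-style usage for the renamed class; a minimal sketch, assuming an `experiment` object created elsewhere in lecrapaud and a raw DataFrame `raw_df` (the column names passed as kwargs are hypothetical, for illustration only):

    # Minimal sketch of the new estimator-style API shown in this diff.
    from lecrapaud.feature_engineering import FeatureEngineering

    fe = FeatureEngineering(
        experiment=experiment,          # context parameters become attributes via the mixin
        columns_drop=["INTERNAL_ID"],   # hypothetical column names
        columns_boolean=["IS_ACTIVE"],
        for_training=True,
    )
    fe.fit(raw_df)              # runs the drop / boolean / date / encoding steps
    engineered = fe.get_data()  # returns the transformed DataFrame once fitted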
@@ -220,7 +233,7 @@ class FeatureEngineeringEngine:
          Returns:
              pd.DataFrame: Original dataframe with new encoded columns added
          """
-         # TODO: target encoding needs to be fit / transform based at inference time.
+         # TODO: target encoding needs to be fit / transform based at transform time.
          df: pd.DataFrame = self.data
          columns_te_groupby: list[list[str]] = self.columns_te_groupby
          columns_te_target: list[str] = self.columns_te_target
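The TODO above points at the usual leakage concern: per-group target statistics must be learned on the training split and only looked up at transform time. A rough sketch of that pattern, using a hypothetical helper that is not part of this package:

    import pandas as pd

    class SimpleTargetEncoder:
        """Hypothetical sketch of fit/transform target encoding; not part of lecrapaud."""

        def fit(self, df: pd.DataFrame, groupby_cols: list[str], target_col: str):
            # learn per-group target means on the training split only
            self.groupby_cols_ = groupby_cols
            self.mapping_ = df.groupby(groupby_cols)[target_col].mean().rename("te").reset_index()
            self.global_mean_ = df[target_col].mean()
            return self

        def transform(self, df: pd.DataFrame) -> pd.Series:
            # look up the learned means; unseen groups fall back to the global mean
            merged = df[self.groupby_cols_].merge(self.mapping_, on=self.groupby_cols_, how="left")
            return merged["te"].fillna(self.global_mean_)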
@@ -299,7 +312,7 @@ class FeatureEngineeringEngine:
          non_numeric_cols = [col for col in missing_cols if col not in numeric_cols]

          logger.warning(
-             f"Missing values found in inference data."
+             f"Missing values found in transform data."
              f"Filling with 0 for numeric columns: {numeric_cols}, "
              f"and 'unknown' for non-numeric columns: {non_numeric_cols}"
          )
@@ -310,649 +323,7 @@ class FeatureEngineeringEngine:
          return df


- class PreprocessFeature:
-
-     def __init__(
-         self,
-         data: pd.DataFrame,
-         experiment,
-         time_series: bool = False,
-         date_column: str | None = None,
-         group_column: str | None = None,
-         val_size: float = 0.2,
-         test_size: float = 0.2,
-         columns_pca: list[str] = [],
-         pca_temporal: list[dict[str, list[str]]] = [],
-         pca_cross_sectional: list[dict[str, list[str]]] = [],
-         columns_onehot: list[str] = [],
-         columns_binary: list[str] = [],
-         columns_ordinal: list[str] = [],
-         columns_frequency: list[str] = [],
-         target_numbers: list = [],
-         target_clf: list = [],
-         **kwargs,
-     ):
-         self.data = data
-         self.data.columns = self.data.columns.str.upper()
-
-         self.experiment = experiment
-         self.columns_pca = [col.upper() for col in columns_pca]
-         self.pca_temporal = pca_temporal
-         self.pca_cross_sectional = pca_cross_sectional
-         self.columns_onehot = [col.upper() for col in columns_onehot]
-         self.columns_binary = [col.upper() for col in columns_binary]
-         self.columns_ordinal = [col.upper() for col in columns_ordinal]
-         self.columns_frequency = [col.upper() for col in columns_frequency]
-         self.target_numbers = target_numbers
-         self.target_clf = target_clf
-
-         self.time_series = time_series
-         self.date_column = date_column
-         self.group_column = group_column
-         self.val_size = val_size
-         self.test_size = test_size
-
-         self.experiment_dir = self.experiment.path
-         self.experiment_id = self.experiment.id
-         self.data_dir = f"{self.experiment_dir}/data"
-         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
-
-     def run(self):
-         # Split
-         train, val, test = (
-             self.train_val_test_split_time_series()
-             if self.time_series
-             else self.train_val_test_split(
-                 stratify_col=f"TARGET_{self.target_numbers[0]}"
-             )
-         )  # TODO: only stratifying first target for now
-
-         # PCA
-         train, pcas = self.add_pca_features(train)
-         val, _ = self.add_pca_features(val, pcas=pcas)
-         test, _ = self.add_pca_features(test, pcas=pcas)
-
-         joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
-
-         train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
-         val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
-         test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
-
-         joblib.dump(
-             pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
-         )
-
-         train, pcas_temporal = self.add_pca_feature_temporal(train)
-         val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
-         test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
-
-         joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
-
-         # Save all features before encoding
-         joblib.dump(
-             list(train.columns),
-             f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
-         )
-
-         # Encoding
-         train, transformer = self.encode_categorical_features(train)
-         val, _ = self.encode_categorical_features(
-             val,
-             transformer=transformer,
-         )
-         test, _ = self.encode_categorical_features(
-             test,
-             transformer=transformer,
-         )
-
-         joblib.dump(self.data, f"{self.data_dir}/full.pkl")
-         joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
-         summary = summarize_dataframe(train)
-         summary.to_csv(f"{self.experiment_dir}/feature_summary.csv", index=False)
-
-         # Save all features before selection
-         joblib.dump(
-             list(train.columns),
-             f"{self.preprocessing_dir}/all_features_before_selection.pkl",
-         )
-
-         return train, val, test
-
-     def inference(self):
-         data = self.data
-
-         # PCA
-         if os.path.exists(f"{self.preprocessing_dir}/pcas.pkl"):
-             pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
-             data, _ = self.add_pca_features(data, pcas=pcas)
-
-         if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
-             pcas_cross_sectional = joblib.load(
-                 f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
-             )
-             data, _ = self.add_pca_feature_cross_sectional(
-                 data, pcas=pcas_cross_sectional
-             )
-
-         if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
-             pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
-             data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
-
-         # Encoding
-         transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
-         data, _ = self.encode_categorical_features(
-             data,
-             transformer=transformer,
-         )
-         return data
-
-     def train_val_test_split_time_series(self):
-         df: pd.DataFrame = self.data
-         date_column: str = self.date_column
-         group_column: str = self.group_column
-         val_size: float = self.val_size
-         test_size: float = self.test_size
-
-         if not date_column:
-             ValueError("Please specify a date_column for time series")
-
-         if group_column:
-             df.sort_values([date_column, group_column], inplace=True)
-         else:
-             df.sort_values(date_column, inplace=True)
-
-         dates = df[date_column].unique()
-
-         val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
-         test_first_id = int(len(dates) * (1 - test_size)) + 1
-
-         train = df[df[date_column].isin(dates[:val_first_id])]
-         val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
-         test = df[df[date_column].isin(dates[test_first_id:])]
-
-         dates = {}
-         for name, data in zip(["train", "val", "test"], [train, val, test]):
-             dates[f"{name}_start_date"] = (
-                 data[date_column].map(pd.Timestamp.fromordinal).iat[0]
-             )
-             dates[f"{name}_end_date"] = (
-                 data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
-             )
-
-             logger.info(
-                 f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
-             )
-
-         Experiment.upsert(
-             match_fields=["id"],
-             id=self.experiment_id,
-             train_size=len(train),
-             val_size=len(val),
-             test_size=len(test),
-             **dates,
-         )
-         return (
-             train.reset_index(drop=True),
-             val.reset_index(drop=True),
-             test.reset_index(drop=True),
-         )
-
-     def train_val_test_split(
-         self,
-         random_state: int = 42,
-         stratify_col: str | None = None,
-     ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-         """
-         Splits a DataFrame into train, validation, and test sets.
-
-         Parameters:
-             df (pd.DataFrame): The full experiment
-             val_size (float): Proportion of validation set (default 0.1)
-             test_size (float): Proportion of test set (default 0.1)
-             random_state (int): Random seed for reproducibility
-             stratify_col (str | None): Optional column to stratify on (for classification tasks)
-
-         Returns:
-             Tuple of (train_df, val_df, test_df)
-         """
-         df: pd.DataFrame = self.data
-         val_size: float = self.val_size
-         test_size: float = self.test_size
-
-         stratify_vals = df[stratify_col] if stratify_col else None
-
-         # First split: train + (val + test)
-         train, temp = train_test_split(
-             df,
-             test_size=val_size + test_size,
-             random_state=random_state,
-             stratify=stratify_vals,
-         )
-
-         # Adjust stratify target for val/test split
-         stratify_temp = temp[stratify_col] if stratify_col else None
-
-         # Compute val and test sizes relative to temp
-         val_ratio = val_size / (val_size + test_size)
-
-         val, test = train_test_split(
-             temp,
-             test_size=1 - val_ratio,
-             random_state=random_state,
-             stratify=stratify_temp,
-         )
-
-         for name, data in zip(["train", "val", "test"], [train, val, test]):
-             logger.info(f"{data.shape} {name} data")
-
-         Experiment.upsert(
-             match_fields=["id"],
-             id=self.experiment_id,
-             train_size=len(train),
-             val_size=len(val),
-             test_size=len(test),
-         )
-         return (
-             train.reset_index(drop=True),
-             val.reset_index(drop=True),
-             test.reset_index(drop=True),
-         )
-
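The two-stage split removed above keeps the requested proportions by rescaling the second call: with the defaults val_size=0.2 and test_size=0.2 it first holds out 40% of the rows, then splits that holdout in half. A standalone check of the arithmetic (illustration only, not package code):

    # Standalone check of the ratio arithmetic used by the removed train_val_test_split.
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.DataFrame({"x": range(100), "TARGET_1": [0, 1] * 50})
    val_size, test_size = 0.2, 0.2

    train, temp = train_test_split(df, test_size=val_size + test_size, random_state=42)
    val_ratio = val_size / (val_size + test_size)               # 0.5
    val, test = train_test_split(temp, test_size=1 - val_ratio, random_state=42)

    print(len(train), len(val), len(test))  # 60 20 20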
-     # embedding and pca
-     def add_pca_features(
-         self, df: pd.DataFrame, n_components: int = 5, pcas=None
-     ) -> tuple[pd.DataFrame, dict]:
-         """
-         Adds PCA components as new columns to a DataFrame from a column containing numpy arrays.
-         NEED TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
-
-         Parameters:
-             df (pd.DataFrame): Input DataFrame
-             column (str): Name of the column containing np.ndarray
-             n_components (int): Number of PCA components to keep
-
-         Returns:
-             pd.DataFrame: DataFrame with new PCA columns added
-         """
-         columns: list[str] = self.columns_pca
-
-         pcas_dict = {}
-         for column in columns:
-             # Convert text to embeddings if necessary
-             if not isinstance(df[column].iloc[0], (np.ndarray, list)):
-                 sentences = df[column].astype(str).tolist()
-                 logger.info(
-                     f"Total sentences to embed for column {column}: {len(sentences)}"
-                 )
-
-                 # Truncate each sentence
-                 truncate_sentences = [truncate_text(sentence) for sentence in sentences]
-
-                 # embedding
-                 embedding_matrix = get_openai_embeddings(truncate_sentences)
-             else:
-                 logger.info(f"Column {column} is already embeddings")
-                 # Stack the vectors into a 2D array
-                 embedding_matrix = np.vstack(df[column].values)
-
-             # Apply PCA
-             if pcas:
-                 pca = pcas[column]
-                 pca_features = pca.transform(embedding_matrix)
-             else:
-                 pca = PCA(n_components=n_components)
-                 pca_features = pca.fit_transform(embedding_matrix)
-
-             # Add PCA columns
-             for i in range(n_components):
-                 df[f"{column}_pca_{i+1}"] = pca_features[:, i]
-
-             # Drop the original column
-             df.drop(column, axis=1, inplace=True)
-             pcas_dict.update({column: pca})
-
-         return df, pcas_dict
-
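The fit-on-train / transform-elsewhere contract of add_pca_features above boils down to the following pattern; a minimal sketch, with a random matrix standing in for the OpenAI embeddings:

    # Minimal sketch: fit PCA on training embeddings, reuse the fitted object elsewhere.
    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    train_embeddings = rng.normal(size=(200, 1536))  # stand-in for get_openai_embeddings output
    val_embeddings = rng.normal(size=(50, 1536))

    pca = PCA(n_components=5)
    train_components = pca.fit_transform(train_embeddings)  # fit on train only
    val_components = pca.transform(val_embeddings)          # reuse at val/test/inference time

    print(train_components.shape, val_components.shape)  # (200, 5) (50, 5)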
-     def add_pca_feature_cross_sectional(
-         self,
-         df: pd.DataFrame,
-         *,
-         n_components: int = 5,
-         pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
-         impute_strategy: str = "median",
-         standardize: bool = True,
-     ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
-         """
-         Builds a pivot (index=index_col, columns=columns_col, values=value_col),
-         fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
-         (by index_col) into df. Returns (df_with_features, pipe).
-         """
-
-         pcas_dict = {}
-
-         for pca_cross_sectional in self.pca_cross_sectional:
-             name, index_col, columns_col, value_col = (
-                 pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
-             )
-             prefix = f"CS_PC_{name}"
-
-             pivot = df.pivot_table(
-                 index=index_col, columns=columns_col, values=value_col
-             ).sort_index()
-
-             # Pipeline reused between train and test
-             if pcas is None:
-                 steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
-                 if standardize:
-                     steps.append(
-                         ("scaler", StandardScaler(with_mean=True, with_std=True))
-                     )
-                 pca = PCA(n_components=n_components, random_state=0)
-                 steps.append(("pca", pca))
-                 pipe = Pipeline(steps)
-                 pipe.fit(pivot)  # <- fit on TRAIN only
-             else:
-                 pipe = pcas[name]  # <- TEST: reuse the existing pipe
-
-             scores = pipe.transform(pivot)  # shape: (n_index, n_components)
-             cols = [f"{prefix}_{i}" for i in range(n_components)]
-             scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
-
-             df = df.merge(scores_df.reset_index(), on=index_col, how="left")
-             pcas_dict.update({name: pipe})
-
-         return df, pcas_dict
-
-     # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
-     def add_pca_feature_temporal(
-         self,
-         df: pd.DataFrame,
-         *,
-         n_components: int = 5,
-         pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
-         impute_strategy: (
-             str | None
-         ) = None,  # None = require all columns to be present
-         standardize: bool = True,
-     ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
-         """
-         Applies a PCA to a matrix (rows = df rows, cols = lags).
-         Fits the Pipeline on TRAIN if pcas=None; otherwise uses pcas and only transforms.
-         Adds the columns f"{prefix}_{i}" to df. Returns (df, pipe).
-         """
-         pcas_dict = {}
-
-         for pca_temporal in self.pca_temporal:
-             name, cols = (pca_temporal[k] for k in ("name", "columns"))
-             prefix = f"TMP_PC_{name}"
-
-             # Mask of usable rows
-             if impute_strategy is None:
-                 mask = (
-                     df[cols].notna().all(axis=1)
-                 )  # no imputation → keep complete rows only
-                 X_fit = df.loc[mask, cols]
-             else:
-                 mask = df[cols].notna().any(axis=1)  # will impute → at least one value
-                 X_fit = df.loc[mask, cols]
-
-             # Pipeline
-             if pcas is None:
-                 steps = []
-                 if impute_strategy is not None:
-                     steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
-                 if standardize:
-                     steps.append(
-                         ("scaler", StandardScaler(with_mean=True, with_std=True))
-                     )
-                 pca = PCA(n_components=n_components, random_state=0)
-                 steps.append(("pca", pca))
-                 pipe = Pipeline(steps)
-                 if not X_fit.empty:
-                     pipe.fit(X_fit)  # <- fit on TRAIN only
-             else:
-                 pipe = pcas[name]  # <- TEST
-
-             # Transform only on valid rows (mask)
-             if not df.loc[mask, cols].empty:
-                 Z = pipe.transform(df.loc[mask, cols])
-                 for i in range(n_components):
-                     df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
-             else:
-                 # create the columns empty if there are no valid rows (schema consistency)
-                 for i in range(n_components):
-                     df[f"{prefix}_{i}"] = pd.NA
-
-             pcas_dict.update({name: pipe})
-
-         return df, pcas_dict
-
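Both removed PCA helpers above share the same pivot-then-pipeline idea; a condensed sketch of the cross-sectional variant, with hypothetical index/column/value names chosen only for illustration:

    # Condensed sketch of cross-sectional PCA: pivot, fit Imputer+Scaler+PCA once,
    # then merge the per-index component scores back onto the original rows.
    import pandas as pd
    from sklearn.decomposition import PCA
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    def cross_sectional_pca(df, index_col, columns_col, value_col, n_components=5, pipe=None):
        pivot = df.pivot_table(index=index_col, columns=columns_col, values=value_col).sort_index()
        if pipe is None:  # train: fit the pipeline once
            pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
                ("pca", PCA(n_components=n_components, random_state=0)),
            ])
            pipe.fit(pivot)
        scores = pd.DataFrame(
            pipe.transform(pivot),
            index=pivot.index,
            columns=[f"CS_PC_{i}" for i in range(n_components)],
        )
        return df.merge(scores.reset_index(), on=index_col, how="left"), pipe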
-     # encoding categorical features
-     def encode_categorical_features(
-         self,
-         df: pd.DataFrame,
-         transformer: ColumnTransformer | None = None,
-     ) -> tuple[pd.DataFrame, ColumnTransformer]:
-         """
-         Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
-
-         Parameters:
-             df (pd.DataFrame): Input DataFrame
-             columns_onehot (list[str]): Creates one binary column per category, for low-cardinality categorical features
-             columns_binary (list[str]): Converts categories into binary and splits bits across columns, for mid-to-high cardinality (e.g., 10–100 unique values)
-             columns_ordinal (list[str]): Assigns integer ranks to categories, when order matters (e.g., low < medium < high)
-             columns_frequency (list[str]): Replaces each category with its frequency count, normalized to a proportion, for high-cardinality features where frequency is meaningful
-             transformer (ColumnTransformer, optional): if provided, applies transform only
-
-         Returns:
-             tuple: (transformed DataFrame, ColumnTransformer)
-         """
-         columns_onehot: list[str] = self.columns_onehot
-         columns_binary: list[str] = self.columns_binary
-         columns_ordinal: list[str] = self.columns_ordinal
-         columns_frequency: list[str] = self.columns_frequency
-
-         X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
-         y = df.loc[:, df.columns.str.contains("^TARGET_")]
-         save_in_db = False
-
-         all_columns = (
-             columns_onehot + columns_binary + columns_ordinal + columns_frequency
-         )
-
-         if transformer:
-             transformed = transformer.transform(X)
-         else:
-             transformer = ColumnTransformer(
-                 transformers=[
-                     (
-                         "onehot",
-                         OneHotEncoder(handle_unknown="ignore", sparse_output=False),
-                         columns_onehot,
-                     ),
-                     (
-                         "ordinal",
-                         OrdinalEncoder(
-                             handle_unknown="use_encoded_value", unknown_value=-1
-                         ),
-                         columns_ordinal,
-                     ),
-                     ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
-                     ("freq", CountEncoder(normalize=True), columns_frequency),
-                 ],
-                 remainder="passthrough",
-             )
-             transformed = transformer.fit_transform(X)
-             save_in_db = True
-
-         # Build output column names
-         column_names = []
-
-         if columns_onehot:
-             column_names.extend(
-                 transformer.named_transformers_["onehot"]
-                 .get_feature_names_out(columns_onehot)
-                 .tolist()
-             )
-
-         if columns_ordinal:
-             column_names.extend(columns_ordinal)
-
-         if columns_binary:
-             column_names.extend(
-                 transformer.named_transformers_["binary"]
-                 .get_feature_names_out(columns_binary)
-                 .tolist()
-             )
-
-         if columns_frequency:
-             column_names.extend(columns_frequency)
-
-         # Add passthrough (non-encoded) columns
-         passthrough_columns = [col for col in X.columns if col not in all_columns]
-         column_names.extend(passthrough_columns)
-
-         X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
-
-         # Try to convert columns to best possible dtypes
-         X_transformed = X_transformed.convert_dtypes()
-
-         # Insert features in db
-         if save_in_db:
-             # Get feature types from transformed data
-             categorical_features, numerical_features = get_features_by_types(
-                 X_transformed
-             )
-
-             # Get column names from DataFrames
-             cat_feature_names = categorical_features.columns.tolist()
-             num_feature_names = numerical_features.columns.tolist()
-
-             # Combine all feature names and their types
-             all_feature_names = cat_feature_names + num_feature_names
-             all_feature_types = ["categorical"] * len(cat_feature_names) + [
-                 "numerical"
-             ] * len(num_feature_names)
-
-             # Upsert features in bulk if we have any features
-             if all_feature_names:
-                 Feature.upsert_bulk(
-                     match_fields=["name"],
-                     name=all_feature_names,
-                     type=all_feature_types,
-                 )
-
-             # Upsert targets in bulk
-             target_names = y.columns.tolist()
-             target_types = [
-                 (
-                     "classification"
-                     if int(target.split("_")[1]) in self.target_clf
-                     else "regression"
-                 )
-                 for target in target_names
-             ]
-
-             Target.upsert_bulk(
-                 match_fields=["name"], name=target_names, type=target_types
-             )
-
-             # Get all the upserted objects
-             targets = Target.filter(name__in=target_names)
-
-             # Update experiment with targets
-             experiment = Experiment.get(self.experiment_id)
-             if experiment:
-                 experiment.targets = targets
-                 experiment.save()
-
-         return pd.concat([X_transformed, y], axis=1), transformer
-
-
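The removed encoder follows the same fit-once, reuse-everywhere contract as the PCA helpers; a toy illustration with a stripped-down ColumnTransformer (toy column names, not package defaults):

    # Toy illustration: fit the column transformer on train, reuse it on new data.
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    train = pd.DataFrame({"COLOR": ["red", "blue", "red"], "SIZE": ["S", "M", "L"], "PRICE": [1.0, 2.0, 3.0]})
    new = pd.DataFrame({"COLOR": ["green"], "SIZE": ["M"], "PRICE": [4.0]})

    transformer = ColumnTransformer(
        transformers=[
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["COLOR"]),
            ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["SIZE"]),
        ],
        remainder="passthrough",
    )
    train_encoded = transformer.fit_transform(train)   # fit on train only
    new_encoded = transformer.transform(new)           # unseen "green" becomes an all-zero one-hot row
    print(train_encoded.shape, new_encoded.shape)      # (3, 4) (1, 4)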
  # analysis & utils
- def summarize_dataframe(
-     df: pd.DataFrame, sample_categorical_threshold: int = 15
- ) -> pd.DataFrame:
-     summary = []
-
-     def is_hashable_series(series: pd.Series) -> bool:
-         try:
-             _ = series.dropna().unique()
-             return True
-         except TypeError:
-             return False
-
-     df = convert_object_columns_that_are_numeric(df)
-     df = df.convert_dtypes()
-
-     for col in df.columns:
-         total_missing = df[col].isna().sum()
-         col_data = df[col].dropna()
-         dtype = col_data.dtype
-
-         if col_data.empty:
-             summary.append(
-                 {
-                     "Column": col,
-                     "Dtype": dtype,
-                     "Type": "unknown",
-                     "Detail": "No non-null values",
-                     "Missing": total_missing,
-                 }
-             )
-             continue
-
-         # Case 1: Numeric columns
-         if pd.api.types.is_numeric_dtype(col_data):
-             unique_vals = col_data.nunique()
-
-             if set(col_data.unique()).issubset({0, 1}):
-                 col_type = "binary-categorical"
-                 detail = "0/1 values only"
-             elif (
-                 pd.api.types.is_integer_dtype(col_data)
-                 and unique_vals <= sample_categorical_threshold
-             ):
-                 col_type = "multi-categorical"
-                 top_vals = col_data.value_counts().head(10)
-                 detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
-             else:
-                 col_type = "numeric"
-                 q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
-                 detail = (
-                     f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
-                     f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
-                 )
-
-         # Case 2: Object or other hashable columns
-         elif is_hashable_series(col_data):
-             unique_vals = col_data.nunique()
-             if unique_vals <= sample_categorical_threshold:
-                 col_type = "object-categorical"
-                 top_vals = col_data.value_counts().head(10)
-                 detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
-             else:
-                 col_type = "high-cardinality-categorical"
-                 detail = f"{unique_vals} unique values"
-
-         # Case 3: Unusable columns
-         else:
-             col_type = "non-hashable"
-             detail = f"Non-hashable type: {type(col_data.iloc[0])}"
-
-         summary.append(
-             {
-                 "Column": col,
-                 "Dtype": dtype,
-                 "Type": col_type,
-                 "Detail": detail,
-                 "Missing": total_missing,
-             }
-         )
-
-     return pd.DataFrame(summary)
-
-
  def convert_object_columns_that_are_numeric(df: pd.DataFrame) -> list:
      """
      Detect object columns that can be safely converted to numeric (float or int).