lecrapaud 0.20.2__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud might be problematic.

lecrapaud/__init__.py CHANGED
@@ -1 +1,5 @@
1
1
  from lecrapaud.api import *
2
+
3
+ # Export default parameters for easy access
4
+ from lecrapaud.api import ExperimentEngine
5
+ DEFAULT_EXPERIMENT_PARAMS = ExperimentEngine.DEFAULT_PARAMS
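The package now re-exports ExperimentEngine and its default parameter table at the top level. A minimal usage sketch (illustrative only; it assumes lecrapaud 0.21.1 is installed and importable):

import lecrapaud

# DEFAULT_EXPERIMENT_PARAMS is the same dict as ExperimentEngine.DEFAULT_PARAMS
print(lecrapaud.DEFAULT_EXPERIMENT_PARAMS["val_size"])      # 0.2
print(lecrapaud.DEFAULT_EXPERIMENT_PARAMS["max_features"])  # 50

# get_default_params() (added in api.py below) returns a copy that is safe to mutate
defaults = lecrapaud.ExperimentEngine.get_default_params()
defaults["val_size"] = 0.3  # does not touch the class-level defaults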
lecrapaud/api.py CHANGED
@@ -163,28 +163,96 @@ class ExperimentEngine:
163
163
  **kwargs: Additional configuration parameters
164
164
  """
165
165
 
166
+ # Default values for all experiment parameters
167
+ DEFAULT_PARAMS = {
168
+ # Feature Engineering
169
+ "columns_drop": [],
170
+ "columns_boolean": [],
171
+ "columns_date": [],
172
+ "columns_te_groupby": [],
173
+ "columns_te_target": [],
174
+ "for_training": True,
175
+ # Preprocessing
176
+ "time_series": False,
177
+ "val_size": 0.2,
178
+ "test_size": 0.2,
179
+ "columns_pca": [],
180
+ "pca_temporal": [],
181
+ "pca_cross_sectional": [],
182
+ "columns_onehot": [],
183
+ "columns_binary": [],
184
+ "columns_ordinal": [],
185
+ "columns_frequency": [],
186
+ # Feature Selection
187
+ "percentile": 20,
188
+ "corr_threshold": 80,
189
+ "max_features": 50,
190
+ "max_p_value_categorical": 0.05,
191
+ # Model Selection
192
+ "target_numbers": [],
193
+ "target_clf": [],
194
+ "models_idx": [],
195
+ "max_timesteps": 120,
196
+ "perform_hyperopt": True,
197
+ "number_of_trials": 20,
198
+ "perform_crossval": False,
199
+ "plot": True,
200
+ "preserve_model": True,
201
+ "target_clf_thresholds": {},
202
+ # Data structure
203
+ "date_column": None,
204
+ "group_column": None,
205
+ }
206
+
166
207
  def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
167
208
  """Initialize the experiment engine with either new or existing experiment."""
168
- # Set all kwargs as instance attributes
169
- if "models_idx" in kwargs:
170
- kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
171
- for key, value in kwargs.items():
172
- setattr(self, key, value)
173
209
 
174
210
  if id:
211
+ # Load existing experiment
175
212
  self.experiment = Experiment.get(id)
176
- kwargs.update(self.experiment.context)
177
- experiment_dir = f"{tmp_dir}/{self.experiment.name}"
178
- preprocessing_dir = f"{experiment_dir}/preprocessing"
179
- data_dir = f"{experiment_dir}/data"
180
- os.makedirs(preprocessing_dir, exist_ok=True)
181
- os.makedirs(data_dir, exist_ok=True)
213
+ # Context from DB takes precedence over kwargs
214
+ effective_kwargs = {
215
+ **self.DEFAULT_PARAMS,
216
+ **kwargs,
217
+ **self.experiment.context,
218
+ }
182
219
  else:
183
220
  if data is None:
184
221
  raise ValueError(
185
222
  "Either id or data must be provided. Data can be a path to a folder containing trained models"
186
223
  )
187
- self.experiment = create_experiment(data=data, **kwargs)
224
+ # New experiment: merge defaults with provided kwargs
225
+ effective_kwargs = {**self.DEFAULT_PARAMS, **kwargs}
226
+
227
+ # Normalize models_idx if present
228
+ if "models_idx" in effective_kwargs:
229
+ effective_kwargs["models_idx"] = normalize_models_idx(
230
+ effective_kwargs["models_idx"]
231
+ )
232
+
233
+ # Set all parameters as instance attributes
234
+ for key, value in effective_kwargs.items():
235
+ setattr(self, key, value)
236
+
237
+ # Create experiment if new
238
+ if not id:
239
+ self.experiment = create_experiment(data=data, **effective_kwargs)
240
+
241
+ # Create directories
242
+ experiment_dir = f"{tmp_dir}/{self.experiment.name}"
243
+ preprocessing_dir = f"{experiment_dir}/preprocessing"
244
+ data_dir = f"{experiment_dir}/data"
245
+ os.makedirs(preprocessing_dir, exist_ok=True)
246
+ os.makedirs(data_dir, exist_ok=True)
247
+
248
+ @classmethod
249
+ def get_default_params(cls):
250
+ """Get the default parameters for experiments."""
251
+ return cls.DEFAULT_PARAMS.copy()
252
+
253
+ def get_effective_context(self):
254
+ """Get the effective context (merged defaults + experiment context)."""
255
+ return {k: getattr(self, k, v) for k, v in self.DEFAULT_PARAMS.items()}
188
256
 
189
257
  def train(self, data, best_params=None):
190
258
  logger.info("Running training...")
lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py CHANGED
@@ -5,38 +5,71 @@ Revises: 033e0f7eca4f
5
5
  Create Date: 2025-10-28 20:06:54.792631
6
6
 
7
7
  """
8
+
8
9
  from typing import Sequence, Union
9
10
 
10
11
  from alembic import op
11
12
  import sqlalchemy as sa
12
13
  from sqlalchemy.dialects import mysql
14
+ from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
13
15
 
14
16
  # revision identifiers, used by Alembic.
15
- revision: str = '0a8fb7826e9b'
16
- down_revision: Union[str, None] = '033e0f7eca4f'
17
+ revision: str = "0a8fb7826e9b"
18
+ down_revision: Union[str, None] = "033e0f7eca4f"
17
19
  branch_labels: Union[str, Sequence[str], None] = None
18
20
  depends_on: Union[str, Sequence[str], None] = None
19
21
 
20
22
 
21
23
  def upgrade() -> None:
22
24
  # ### commands auto generated by Alembic - please adjust! ###
23
- op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
24
- op.drop_column('lecrapaud_experiments', 'corr_threshold')
25
- op.drop_column('lecrapaud_experiments', 'max_features')
26
- op.drop_column('lecrapaud_experiments', 'percentile')
27
- op.drop_column('lecrapaud_experiments', 'type')
28
- op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
29
- op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
25
+ op.add_column(
26
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
27
+ sa.Column("number_of_targets", sa.Integer(), nullable=True),
28
+ )
29
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "corr_threshold")
30
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "max_features")
31
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "percentile")
32
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "type")
33
+ op.drop_index(
34
+ op.f("ix_model_selection_scores_id"),
35
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
36
+ )
37
+ op.create_index(
38
+ op.f("ix_model_selection_scores_id"),
39
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
40
+ ["id"],
41
+ unique=False,
42
+ )
30
43
  # ### end Alembic commands ###
31
44
 
32
45
 
33
46
  def downgrade() -> None:
34
47
  # ### commands auto generated by Alembic - please adjust! ###
35
- op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
36
- op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
37
- op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
38
- op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
39
- op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
40
- op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
41
- op.drop_column('lecrapaud_experiments', 'number_of_targets')
48
+ op.drop_index(
49
+ op.f("ix_lecrapaud_model_selection_scores_id"),
50
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
51
+ )
52
+ op.create_index(
53
+ op.f("ix_model_selection_scores_id"),
54
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
55
+ ["id"],
56
+ unique=False,
57
+ )
58
+ op.add_column(
59
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
60
+ sa.Column("type", mysql.VARCHAR(length=50), nullable=False),
61
+ )
62
+ op.add_column(
63
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
64
+ sa.Column("percentile", mysql.FLOAT(), nullable=False),
65
+ )
66
+ op.add_column(
67
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
68
+ sa.Column("max_features", mysql.INTEGER(), autoincrement=False, nullable=False),
69
+ )
70
+ op.add_column(
71
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
72
+ sa.Column("corr_threshold", mysql.FLOAT(), nullable=False),
73
+ )
74
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "number_of_targets")
42
75
  # ### end Alembic commands ###
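Table names in this migration are now derived from LECRAPAUD_TABLE_PREFIX rather than hard-coded. A rough illustration, assuming the default prefix matches the previous literals:

# Illustrative only; the real value comes from lecrapaud/config.py.
LECRAPAUD_TABLE_PREFIX = "lecrapaud"
experiments_table = f"{LECRAPAUD_TABLE_PREFIX}_experiments"            # "lecrapaud_experiments"
scores_table = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"      # "lecrapaud_model_selection_scores"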
lecrapaud/experiment.py CHANGED
@@ -28,6 +28,9 @@ def create_experiment(
28
28
  if kwargs.get("time_series") and not date_column:
29
29
  raise ValueError("date_column must be provided for time series experiments")
30
30
 
31
+ if experiment_name is None:
32
+ raise ValueError("experiment_name must be provided")
33
+
31
34
  dates = {}
32
35
  if date_column:
33
36
  dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
lecrapaud/feature_engineering.py CHANGED
@@ -94,7 +94,7 @@ class FeatureEngineeringEngine:
94
94
  self.data = data
95
95
  self.experiment = experiment
96
96
  self.for_training = for_training
97
-
97
+
98
98
  # Get all parameters from experiment context
99
99
  self.columns_drop = self.experiment.context.get("columns_drop", [])
100
100
  self.columns_boolean = self.experiment.context.get("columns_boolean", [])
@@ -330,15 +330,19 @@ class PreprocessFeature:
330
330
  self.test_size = context.get("test_size", 0.2)
331
331
  self.target_numbers = context.get("target_numbers", [])
332
332
  self.target_clf = context.get("target_clf", [])
333
-
333
+
334
334
  # Handle list parameters with uppercase conversion
335
335
  self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
336
336
  self.pca_temporal = context.get("pca_temporal", [])
337
337
  self.pca_cross_sectional = context.get("pca_cross_sectional", [])
338
338
  self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
339
339
  self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
340
- self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
341
- self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
340
+ self.columns_ordinal = [
341
+ col.upper() for col in context.get("columns_ordinal", [])
342
+ ]
343
+ self.columns_frequency = [
344
+ col.upper() for col in context.get("columns_frequency", [])
345
+ ]
342
346
 
343
347
  self.experiment_dir = self.experiment.path
344
348
  self.experiment_id = self.experiment.id
@@ -601,7 +605,7 @@ class PreprocessFeature:
601
605
 
602
606
  return df, pcas_dict
603
607
 
604
- def add_pca_feature_cross_sectional(
608
+ def add_pca_feature_cross_sectional_old(
605
609
  self,
606
610
  df: pd.DataFrame,
607
611
  *,
@@ -653,8 +657,228 @@ class PreprocessFeature:
653
657
 
654
658
  return df, pcas_dict
655
659
 
660
+ def add_pca_feature_cross_sectional(
661
+ self,
662
+ df: pd.DataFrame,
663
+ *,
664
+ n_components: int = 5,
665
+ pcas: dict[str, Pipeline] | None = None, # si fourni: transform only
666
+ impute_strategy: str = "median",
667
+ standardize: bool = True,
668
+ lookback_days: int = 365, # nombre de jours à regarder en arrière pour le fit
669
+ refresh_frequency: int = 90, # refresh la PCA tous les X jours
670
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
671
+ """
672
+ Construit un pivot (index=index_col, columns=columns_col, values=value_col),
673
+ fit (ou réutilise) un Pipeline Imputer(+Scaler)+PCA, puis merge les scores
674
+ (par index_col) dans df. Renvoie (df_avec_features, pipe).
675
+
676
+ Pour les séries temporelles : fit la PCA uniquement sur les données passées
677
+ pour éviter le leakage, avec refresh périodique.
678
+
679
+ Gère le cas des données panel où on a plusieurs séries temporelles
680
+ (ex: plusieurs stocks avec les mêmes dates).
681
+ """
682
+
683
+ pcas_dict = {}
684
+ index_saved = df.index
685
+
686
+ for pca_cross_sectional in self.pca_cross_sectional:
687
+ name, index_col, columns_col, value_col = (
688
+ pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
689
+ )
690
+ prefix = f"CS_PC_{name}"
691
+
692
+ # Vérifier si c'est une série temporelle avec index = date
693
+ # Les dates sont déjà en ordinal après cyclic_encode_date
694
+ is_time_series = self.time_series and index_col == self.date_column
695
+
696
+ if is_time_series:
697
+ # Cas spécial : PCA cross-sectional sur des données de panel time series
698
+ # Par exemple : PCA sur les returns de tous les stocks à chaque date
699
+ # pour capturer le régime de marché
700
+
701
+ all_scores = []
702
+
703
+ # Les dates sont déjà en ordinal
704
+ unique_dates = sorted(df[index_col].unique())
705
+
706
+ # Pour l'inference, utiliser la PCA fournie
707
+ if pcas is not None:
708
+ pipe = pcas[name]
709
+ pivot = df.pivot_table(
710
+ index=index_col, columns=columns_col, values=value_col
711
+ ).sort_index()
712
+ scores = pipe.transform(pivot)
713
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
714
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
715
+ else:
716
+ # Training : fit PCA de manière expanding avec refresh périodique
717
+ pipe = None
718
+ last_fit_date = None
719
+
720
+ for i, current_date_ordinal in enumerate(unique_dates):
721
+ # Convertir l'ordinal en date pour les calculs de temps
722
+ current_date = pd.Timestamp.fromordinal(
723
+ int(current_date_ordinal)
724
+ )
725
+
726
+ # Déterminer si on doit refitter la PCA
727
+ should_refit = pipe is None or ( # Première fois
728
+ last_fit_date is not None
729
+ and (current_date - last_fit_date).days >= refresh_frequency
730
+ )
731
+
732
+ if (
733
+ should_refit and i > 30
734
+ ): # Attendre au moins 30 jours de données
735
+ # Prendre les données des 'lookback_days' derniers jours
736
+ lookback_start_date = current_date - pd.Timedelta(
737
+ days=lookback_days
738
+ )
739
+ lookback_start_ordinal = pd.Timestamp.toordinal(
740
+ lookback_start_date
741
+ )
742
+
743
+ # Masque pour les dates passées uniquement (éviter le leakage)
744
+ mask_fit = (df[index_col] >= lookback_start_ordinal) & (
745
+ df[index_col] < current_date_ordinal
746
+ )
747
+ df_fit = df[mask_fit]
748
+
749
+ if len(df_fit) > 0:
750
+ # Créer le pivot pour la période de lookback
751
+ pivot_fit = df_fit.pivot_table(
752
+ index=index_col,
753
+ columns=columns_col,
754
+ values=value_col,
755
+ ).sort_index()
756
+
757
+ # Vérifier qu'on a assez de dates et de colonnes
758
+ if (
759
+ len(pivot_fit) >= n_components
760
+ and pivot_fit.shape[1] >= n_components
761
+ ):
762
+ # Créer nouveau pipeline
763
+ steps = [
764
+ (
765
+ "imputer",
766
+ SimpleImputer(strategy=impute_strategy),
767
+ )
768
+ ]
769
+ if standardize:
770
+ steps.append(
771
+ (
772
+ "scaler",
773
+ StandardScaler(
774
+ with_mean=True, with_std=True
775
+ ),
776
+ )
777
+ )
778
+ pca = PCA(n_components=n_components, random_state=0)
779
+ steps.append(("pca", pca))
780
+ pipe = Pipeline(steps)
781
+ pipe.fit(pivot_fit)
782
+ last_fit_date = current_date
783
+
784
+ logger.debug(
785
+ f"PCA {name} refitted at date {current_date.strftime('%Y-%m-%d')} "
786
+ f"using {len(pivot_fit)} dates and {pivot_fit.shape[1]} columns"
787
+ )
788
+
789
+ # Transform pour la date courante uniquement
790
+ if pipe is not None:
791
+ df_current = df[df[index_col] == current_date_ordinal]
792
+ if len(df_current) > 0:
793
+ pivot_current = df_current.pivot_table(
794
+ index=index_col,
795
+ columns=columns_col,
796
+ values=value_col,
797
+ )
798
+ try:
799
+ scores_current = pipe.transform(pivot_current)
800
+ scores_dict = {
801
+ index_col: [current_date_ordinal],
802
+ **{
803
+ f"{prefix}_{j}": [scores_current[0, j]]
804
+ for j in range(n_components)
805
+ },
806
+ }
807
+ all_scores.append(pd.DataFrame(scores_dict))
808
+ except Exception as e:
809
+ # En cas d'erreur (ex: nouvelles colonnes), créer des valeurs manquantes
810
+ logger.debug(
811
+ f"PCA transform error at date {current_date}: {str(e)}"
812
+ )
813
+ scores_dict = {
814
+ index_col: [current_date_ordinal],
815
+ **{
816
+ f"{prefix}_{j}": [np.nan]
817
+ for j in range(n_components)
818
+ },
819
+ }
820
+ all_scores.append(pd.DataFrame(scores_dict))
821
+ else:
822
+ # Pas encore de PCA fittée, créer des NaN
823
+ scores_dict = {
824
+ index_col: [current_date_ordinal],
825
+ **{
826
+ f"{prefix}_{j}": [np.nan]
827
+ for j in range(n_components)
828
+ },
829
+ }
830
+ all_scores.append(pd.DataFrame(scores_dict))
831
+
832
+ # Combiner tous les scores
833
+ if all_scores:
834
+ scores_df = pd.concat(all_scores, ignore_index=True)
835
+ else:
836
+ # Créer un DataFrame vide avec les bonnes colonnes
837
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
838
+ scores_df = pd.DataFrame(columns=[index_col] + cols)
839
+
840
+ # Merger les scores
841
+ df = df.merge(scores_df, on=index_col, how="left")
842
+ df.index = index_saved
843
+
844
+ # Forward fill puis 0 pour éviter les NaN
845
+ pca_cols = [col for col in df.columns if col.startswith(prefix)]
846
+ df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
847
+
848
+ pcas_dict.update({name: pipe})
849
+
850
+ else:
851
+ # Approche classique (non time series ou index != date)
852
+ pivot = df.pivot_table(
853
+ index=index_col, columns=columns_col, values=value_col
854
+ ).sort_index()
855
+
856
+ # Pipeline à réutiliser entre train et test
857
+ if pcas is None:
858
+ steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
859
+ if standardize:
860
+ steps.append(
861
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
862
+ )
863
+ pca = PCA(n_components=n_components, random_state=0)
864
+ steps.append(("pca", pca))
865
+ pipe = Pipeline(steps)
866
+ pipe.fit(pivot) # <- fit sur TRAIN uniquement
867
+ else:
868
+ pipe = pcas[name] # <- TEST : on réutilise le pipe existant
869
+
870
+ scores = pipe.transform(pivot) # shape: (n_index, n_components)
871
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
872
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
873
+
874
+ df = df.merge(scores_df.reset_index(), on=index_col, how="left")
875
+ df.index = index_saved
876
+ pcas_dict.update({name: pipe})
877
+
878
+ return df, pcas_dict
879
+
656
880
  # ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
657
- def add_pca_feature_temporal(
881
+ def add_pca_feature_temporal_old(
658
882
  self,
659
883
  df: pd.DataFrame,
660
884
  *,
@@ -717,6 +941,187 @@ class PreprocessFeature:
717
941
 
718
942
  return df, pcas_dict
719
943
 
944
+ def add_pca_feature_temporal(
945
+ self,
946
+ df: pd.DataFrame,
947
+ *,
948
+ n_components: int = 5,
949
+ pcas: dict[str, Pipeline] | None = None,
950
+ impute_strategy: str = "median",
951
+ standardize: bool = True,
952
+ lookback_days: int = 365,
953
+ refresh_frequency: int = 90,
954
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
955
+ """
956
+ PCA temporelle pour time series avec support panel data.
957
+ Crée automatiquement les colonnes de lags et évite le look-ahead bias.
958
+
959
+ Format pca_temporal simplifié:
960
+ [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
961
+ """
962
+ pcas_dict = {}
963
+
964
+ for pca_config in self.pca_temporal:
965
+ # Support both old and new format
966
+ if "columns" in pca_config:
967
+ # Old format: use existing columns
968
+ name = pca_config["name"]
969
+ lag_columns = pca_config["columns"]
970
+ base_column = None
971
+ num_lags = len(lag_columns)
972
+ else:
973
+ # New format: create lag columns
974
+ name = pca_config["name"]
975
+ base_column = pca_config["column"].upper()
976
+ num_lags = pca_config.get("lags", 20)
977
+
978
+ # Create lag columns if they don't exist
979
+ if self.group_column:
980
+ # Panel data: create lags by group
981
+ for lag in range(1, num_lags + 1):
982
+ lag_col = f"{base_column}_-{lag}"
983
+ if lag_col not in df.columns:
984
+ df[lag_col] = df.groupby(self.group_column)[base_column].shift(lag)
985
+ else:
986
+ # Simple time series
987
+ for lag in range(1, num_lags + 1):
988
+ lag_col = f"{base_column}_-{lag}"
989
+ if lag_col not in df.columns:
990
+ df[lag_col] = df[base_column].shift(lag)
991
+
992
+ lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]
993
+
994
+ prefix = f"TMP_PC_{name}"
995
+
996
+ # For time series: avoid look-ahead bias
997
+ if self.time_series and self.date_column:
998
+ all_scores = []
999
+ unique_dates = sorted(df[self.date_column].unique())
1000
+
1001
+ if pcas is not None:
1002
+ # Inference: use provided PCA
1003
+ pipe = pcas[name]
1004
+
1005
+ # Apply to all data at once
1006
+ mask = df[lag_columns].notna().all(axis=1)
1007
+ if mask.any():
1008
+ X_transform = df.loc[mask, lag_columns]
1009
+ scores = pipe.transform(X_transform)
1010
+
1011
+ for i in range(n_components):
1012
+ df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
1013
+
1014
+ # Fill NaN with forward fill then 0
1015
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
1016
+ df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
1017
+
1018
+ else:
1019
+ # Training: expanding window with periodic refresh
1020
+ pipe = None
1021
+ last_fit_date = None
1022
+
1023
+ for current_date_ordinal in unique_dates:
1024
+ current_date = pd.Timestamp.fromordinal(int(current_date_ordinal))
1025
+
1026
+ # Determine if we should refit
1027
+ should_refit = pipe is None or (
1028
+ last_fit_date is not None
1029
+ and (current_date - last_fit_date).days >= refresh_frequency
1030
+ )
1031
+
1032
+ if should_refit and len(df[df[self.date_column] < current_date_ordinal]) > num_lags * 2:
1033
+ # Get historical data for fitting
1034
+ lookback_start = current_date - pd.Timedelta(days=lookback_days)
1035
+ lookback_start_ordinal = pd.Timestamp.toordinal(lookback_start)
1036
+
1037
+ mask_fit = (
1038
+ (df[self.date_column] >= lookback_start_ordinal) &
1039
+ (df[self.date_column] < current_date_ordinal) &
1040
+ df[lag_columns].notna().all(axis=1)
1041
+ )
1042
+
1043
+ if mask_fit.sum() >= n_components:
1044
+ X_fit = df.loc[mask_fit, lag_columns]
1045
+
1046
+ # Create pipeline
1047
+ steps = []
1048
+ if impute_strategy is not None:
1049
+ steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
1050
+ if standardize:
1051
+ steps.append(("scaler", StandardScaler()))
1052
+ steps.append(("pca", PCA(n_components=n_components, random_state=0)))
1053
+
1054
+ pipe = Pipeline(steps)
1055
+ pipe.fit(X_fit)
1056
+ last_fit_date = current_date
1057
+
1058
+ logger.debug(
1059
+ f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
1060
+ f"using {len(X_fit)} samples"
1061
+ )
1062
+
1063
+ # Transform current date data
1064
+ if pipe is not None:
1065
+ mask_current = (
1066
+ (df[self.date_column] == current_date_ordinal) &
1067
+ df[lag_columns].notna().all(axis=1)
1068
+ )
1069
+
1070
+ if mask_current.any():
1071
+ X_current = df.loc[mask_current, lag_columns]
1072
+ scores = pipe.transform(X_current)
1073
+
1074
+ for i in range(n_components):
1075
+ df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]
1076
+
1077
+ # Fill NaN with forward fill then 0
1078
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
1079
+ for col in pca_cols:
1080
+ if col not in df.columns:
1081
+ df[col] = 0
1082
+ df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
1083
+
1084
+ pcas_dict[name] = pipe
1085
+
1086
+ else:
1087
+ # Non time-series: use original approach
1088
+ mask = df[lag_columns].notna().all(axis=1)
1089
+
1090
+ if pcas is None and mask.any():
1091
+ X_fit = df.loc[mask, lag_columns]
1092
+
1093
+ steps = []
1094
+ if impute_strategy is not None:
1095
+ steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
1096
+ if standardize:
1097
+ steps.append(("scaler", StandardScaler()))
1098
+ steps.append(("pca", PCA(n_components=n_components, random_state=0)))
1099
+
1100
+ pipe = Pipeline(steps)
1101
+ pipe.fit(X_fit)
1102
+ pcas_dict[name] = pipe
1103
+ elif pcas is not None:
1104
+ pipe = pcas[name]
1105
+ pcas_dict[name] = pipe
1106
+ else:
1107
+ continue
1108
+
1109
+ if mask.any():
1110
+ X_transform = df.loc[mask, lag_columns]
1111
+ scores = pipe.transform(X_transform)
1112
+
1113
+ for i in range(n_components):
1114
+ df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
1115
+
1116
+ # Fill missing values
1117
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
1118
+ for col in pca_cols:
1119
+ if col not in df.columns:
1120
+ df[col] = 0
1121
+ df[pca_cols] = df[pca_cols].fillna(0)
1122
+
1123
+ return df, pcas_dict
1124
+
720
1125
  # encoding categorical features
721
1126
  def encode_categorical_features(
722
1127
  self,
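The two rewritten PCA helpers read their configuration from the experiment context and, for time series, fit only on past data inside a lookback window that is refreshed every refresh_frequency days to avoid look-ahead bias. A hedged configuration sketch inferred from the code above (the key names come from the code; the values are invented examples):

experiment_params = {
    "pca_cross_sectional": [
        # pivot index/columns/value used to build the cross-sectional matrix
        {"name": "MKT", "index": "DATE", "columns": "TICKER", "value": "RET"},
    ],
    "pca_temporal": [
        {"name": "LAST_20_RET", "column": "RET", "lags": 20},        # new format: lag columns are created on the fly
        # {"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2"]},  # old format: pre-existing lag columns
    ],
}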
lecrapaud-0.21.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lecrapaud
3
- Version: 0.20.2
3
+ Version: 0.21.1
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  License-File: LICENSE
@@ -12,9 +12,12 @@ Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Dist: catboost (>=1.2.8)
13
13
  Requires-Dist: category-encoders (>=2.8.1)
14
14
  Requires-Dist: celery (>=5.5.3)
15
+ Requires-Dist: celery-redbeat (>=2.3.2)
15
16
  Requires-Dist: ftfy (>=6.3.1)
17
+ Requires-Dist: hyperopt (>=0.2.7)
16
18
  Requires-Dist: joblib (>=1.5.1)
17
19
  Requires-Dist: keras (>=3.10.0)
20
+ Requires-Dist: keras-tcn (>=3.5.6)
18
21
  Requires-Dist: lightgbm (>=4.6.0)
19
22
  Requires-Dist: matplotlib (>=3.10.3)
20
23
  Requires-Dist: mlxtend (>=0.23.4)
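Version 0.21.1 also adds celery-redbeat, hyperopt and keras-tcn as runtime requirements. One way to confirm they are declared after installing the wheel (a quick check, not part of the package):

from importlib.metadata import requires

new_deps = ("celery-redbeat", "hyperopt", "keras-tcn")
print([r for r in requires("lecrapaud") if r.startswith(new_deps)])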
lecrapaud-0.21.1.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
1
- lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
2
- lecrapaud/api.py,sha256=IQlH3wcSzxYgvlamfICNMwNsQGoaNxBJUPTlC9M0kBk,20321
1
+ lecrapaud/__init__.py,sha256=7Wp_VF08UZP8o-GkpB4_yRjP4twQmpcTc3202OkPmHs,176
2
+ lecrapaud/api.py,sha256=7OL_wbg9hCmlZ0WI6eCDkublntES3f320OZlpuKu8f4,22376
3
3
  lecrapaud/config.py,sha256=0NEg61QdLxQ97bVFDDXa6OwlWFEo_z8VIhX5KrD1ik0,1170
4
4
  lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
5
5
  lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -14,7 +14,7 @@ lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_
14
14
  lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py,sha256=g6H2Z9MwB6UEiqdGlBoHBXpO9DTaWkwHt8FS6joVOm0,1191
15
15
  lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py,sha256=FshOF1t-NWXrBtXT3wMNGFslJ4sWUxzvBEXSymu05cI,1043
16
16
  lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py,sha256=htHUD4zPJr-0z_DQfTi8r9RsFVe9m7SL0f7oRIvLIcQ,10999
17
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=o3TNHq1GTFjxfk2zHWaUbq91khMJi6Xy6HToO9i54AU,2051
17
+ lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=0NBvOwPqMXpWnDEGiEBk_IeLKmXQ5ZcU-dqHeSEgsRQ,2557
18
18
  lecrapaud/db/alembic.ini,sha256=Zw2rdwsKV6c7J1SPtoFIPDX08_oTP3MuUKnNxBDiY8I,3796
19
19
  lecrapaud/db/models/__init__.py,sha256=-XoCN1eeLihnNxBMl90lXrgrTSDkMbeqgienMqFi5f4,702
20
20
  lecrapaud/db/models/base.py,sha256=0548x4ftd6Oim9BJmtD7Er4izM6u0QCrlTG5560384w,9458
@@ -29,8 +29,8 @@ lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk
29
29
  lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,1109
30
30
  lecrapaud/db/session.py,sha256=u9NCwUoV5VbtScRb6HOSQr4oTEjIwj0waP5mGlc1qJg,3735
31
31
  lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
32
- lecrapaud/experiment.py,sha256=hhi6NdVKtxoyx_AGBB4iNEZZpd9b3rKs23qiLPf-mUk,2384
33
- lecrapaud/feature_engineering.py,sha256=UM-EIOsgYWedqsR9uA-09eaWSb9FofVxoE0rRcDelQ8,39173
32
+ lecrapaud/experiment.py,sha256=LiecZS3P4igO_3nJ4IB-2b25CttQS2RePDnhBNucvdE,2478
33
+ lecrapaud/feature_engineering.py,sha256=SvGrJXv24rVgH0QE5mRwJITcCLfUqgbV2Ep68bBVnJs,58794
34
34
  lecrapaud/feature_selection.py,sha256=Q9xWVmZsvRjX9mJHB_PY_KLXsEAYNLX7txSe0cniY4A,47529
35
35
  lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
36
36
  lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
@@ -44,7 +44,7 @@ lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIF
44
44
  lecrapaud/model_selection.py,sha256=o4_hOEp91_33HtMatVHU7YPc71KZ2hK7wucN63xqWkA,88017
45
45
  lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
46
46
  lecrapaud/utils.py,sha256=0k76HFETO0_NgCYUv8b3RTBLgry6MsDBaHJfpAplxCY,8855
47
- lecrapaud-0.20.2.dist-info/METADATA,sha256=FUXEVYVCJAoat8HUtsupISlRbK56YVxezYwCH6j4kBE,14239
48
- lecrapaud-0.20.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
- lecrapaud-0.20.2.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
- lecrapaud-0.20.2.dist-info/RECORD,,
47
+ lecrapaud-0.21.1.dist-info/METADATA,sha256=rKls8xvjhu9f72jTw2sjBYCmQPw-N02RSScSOjJ1E2g,14348
48
+ lecrapaud-0.21.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
+ lecrapaud-0.21.1.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
+ lecrapaud-0.21.1.dist-info/RECORD,,