lecrapaud 0.21.0.tar.gz → 0.21.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/PKG-INFO +1 -1
  2. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/feature_engineering.py +189 -3
  3. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/pyproject.toml +1 -1
  4. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/LICENSE +0 -0
  5. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/README.md +0 -0
  6. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/__init__.py +0 -0
  7. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/api.py +0 -0
  8. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/config.py +0 -0
  9. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/__init__.py +0 -0
  10. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/README +0 -0
  11. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/env.py +0 -0
  12. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/script.py.mako +0 -0
  13. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  14. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  15. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  16. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  17. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
  18. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +0 -0
  19. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +0 -0
  20. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +0 -0
  21. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +0 -0
  22. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +0 -0
  23. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic.ini +0 -0
  24. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/__init__.py +0 -0
  25. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/base.py +0 -0
  26. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/experiment.py +0 -0
  27. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature.py +0 -0
  28. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature_selection.py +0 -0
  29. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  30. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model.py +0 -0
  31. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model_selection.py +0 -0
  32. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model_selection_score.py +0 -0
  33. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/target.py +0 -0
  34. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/utils.py +0 -0
  35. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/session.py +0 -0
  36. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/directories.py +0 -0
  37. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/experiment.py +0 -0
  38. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/feature_selection.py +0 -0
  39. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/integrations/openai_integration.py +0 -0
  40. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/__init__.py +0 -0
  41. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/config.py +0 -0
  42. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/scheduler.py +0 -0
  43. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/tasks.py +0 -0
  44. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  45. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  46. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  47. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  48. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/model_selection.py +0 -0
  49. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/search_space.py +0 -0
  50. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/utils.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lecrapaud
-Version: 0.21.0
+Version: 0.21.1
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 License-File: LICENSE
@@ -605,7 +605,7 @@ class PreprocessFeature:

         return df, pcas_dict

-    def add_pca_feature_cross_sectional(
+    def add_pca_feature_cross_sectional_old(
         self,
         df: pd.DataFrame,
         *,
@@ -657,7 +657,7 @@ class PreprocessFeature:

         return df, pcas_dict

-    def add_pca_feature_cross_sectional_time_series(
+    def add_pca_feature_cross_sectional(
         self,
         df: pd.DataFrame,
         *,
@@ -840,6 +840,11 @@ class PreprocessFeature:
                 # Merge the scores
                 df = df.merge(scores_df, on=index_col, how="left")
                 df.index = index_saved
+
+                # Forward fill, then 0, to avoid NaN
+                pca_cols = [col for col in df.columns if col.startswith(prefix)]
+                df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
                 pcas_dict.update({name: pipe})

             else:
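The added fill step forward-fills the newly created PCA score columns and then falls back to 0. Worth noting: fillna(method='ffill') is deprecated since pandas 2.1 in favor of DataFrame.ffill(). A minimal equivalent sketch (the frame and column names are illustrative, not taken from the package):

    import pandas as pd

    # Two hypothetical PCA score columns with gaps.
    df = pd.DataFrame({"PC_A_0": [None, 1.0, None, 2.0],
                       "PC_A_1": [None, None, 3.0, None]})

    pca_cols = [col for col in df.columns if col.startswith("PC_A")]
    # Same effect as .fillna(method='ffill').fillna(0), without the deprecation warning.
    df[pca_cols] = df[pca_cols].ffill().fillna(0)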
@@ -873,7 +878,7 @@ class PreprocessFeature:
         return df, pcas_dict

     # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
-    def add_pca_feature_temporal(
+    def add_pca_feature_temporal_old(
         self,
         df: pd.DataFrame,
         *,
@@ -936,6 +941,187 @@ class PreprocessFeature:

         return df, pcas_dict

+    def add_pca_feature_temporal(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,
+        impute_strategy: str = "median",
+        standardize: bool = True,
+        lookback_days: int = 365,
+        refresh_frequency: int = 90,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Temporal PCA for time series, with panel data support.
+        Automatically creates the lag columns and avoids look-ahead bias.
+
+        Simplified pca_temporal format:
+        [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
+        """
+        pcas_dict = {}
+
+        for pca_config in self.pca_temporal:
+            # Support both old and new format
+            if "columns" in pca_config:
+                # Old format: use existing columns
+                name = pca_config["name"]
+                lag_columns = pca_config["columns"]
+                base_column = None
+                num_lags = len(lag_columns)
+            else:
+                # New format: create lag columns
+                name = pca_config["name"]
+                base_column = pca_config["column"].upper()
+                num_lags = pca_config.get("lags", 20)
+
+                # Create lag columns if they don't exist
+                if self.group_column:
+                    # Panel data: create lags by group
+                    for lag in range(1, num_lags + 1):
+                        lag_col = f"{base_column}_-{lag}"
+                        if lag_col not in df.columns:
+                            df[lag_col] = df.groupby(self.group_column)[base_column].shift(lag)
+                else:
+                    # Simple time series
+                    for lag in range(1, num_lags + 1):
+                        lag_col = f"{base_column}_-{lag}"
+                        if lag_col not in df.columns:
+                            df[lag_col] = df[base_column].shift(lag)
+
+                lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]
+
+            prefix = f"TMP_PC_{name}"
+
+            # For time series: avoid look-ahead bias
+            if self.time_series and self.date_column:
+                all_scores = []
+                unique_dates = sorted(df[self.date_column].unique())
+
+                if pcas is not None:
+                    # Inference: use provided PCA
+                    pipe = pcas[name]
+
+                    # Apply to all data at once
+                    mask = df[lag_columns].notna().all(axis=1)
+                    if mask.any():
+                        X_transform = df.loc[mask, lag_columns]
+                        scores = pipe.transform(X_transform)
+
+                        for i in range(n_components):
+                            df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
+
+                    # Fill NaN with forward fill then 0
+                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                    df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
+                else:
+                    # Training: expanding window with periodic refresh
+                    pipe = None
+                    last_fit_date = None
+
+                    for current_date_ordinal in unique_dates:
+                        current_date = pd.Timestamp.fromordinal(int(current_date_ordinal))
+
+                        # Determine if we should refit
+                        should_refit = pipe is None or (
+                            last_fit_date is not None
+                            and (current_date - last_fit_date).days >= refresh_frequency
+                        )
+
+                        if should_refit and len(df[df[self.date_column] < current_date_ordinal]) > num_lags * 2:
+                            # Get historical data for fitting
+                            lookback_start = current_date - pd.Timedelta(days=lookback_days)
+                            lookback_start_ordinal = pd.Timestamp.toordinal(lookback_start)
+
+                            mask_fit = (
+                                (df[self.date_column] >= lookback_start_ordinal) &
+                                (df[self.date_column] < current_date_ordinal) &
+                                df[lag_columns].notna().all(axis=1)
+                            )
+
+                            if mask_fit.sum() >= n_components:
+                                X_fit = df.loc[mask_fit, lag_columns]
+
+                                # Create pipeline
+                                steps = []
+                                if impute_strategy is not None:
+                                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                                if standardize:
+                                    steps.append(("scaler", StandardScaler()))
+                                steps.append(("pca", PCA(n_components=n_components, random_state=0)))
+
+                                pipe = Pipeline(steps)
+                                pipe.fit(X_fit)
+                                last_fit_date = current_date
+
+                                logger.debug(
+                                    f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
+                                    f"using {len(X_fit)} samples"
+                                )
+
+                        # Transform current date data
+                        if pipe is not None:
+                            mask_current = (
+                                (df[self.date_column] == current_date_ordinal) &
+                                df[lag_columns].notna().all(axis=1)
+                            )
+
+                            if mask_current.any():
+                                X_current = df.loc[mask_current, lag_columns]
+                                scores = pipe.transform(X_current)
+
+                                for i in range(n_components):
+                                    df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]
+
+                    # Fill NaN with forward fill then 0
+                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                    for col in pca_cols:
+                        if col not in df.columns:
+                            df[col] = 0
+                    df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
+                pcas_dict[name] = pipe
+
+            else:
+                # Non time-series: use original approach
+                mask = df[lag_columns].notna().all(axis=1)
+
+                if pcas is None and mask.any():
+                    X_fit = df.loc[mask, lag_columns]
+
+                    steps = []
+                    if impute_strategy is not None:
+                        steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                    if standardize:
+                        steps.append(("scaler", StandardScaler()))
+                    steps.append(("pca", PCA(n_components=n_components, random_state=0)))
+
+                    pipe = Pipeline(steps)
+                    pipe.fit(X_fit)
+                    pcas_dict[name] = pipe
+                elif pcas is not None:
+                    pipe = pcas[name]
+                    pcas_dict[name] = pipe
+                else:
+                    continue
+
+                if mask.any():
+                    X_transform = df.loc[mask, lag_columns]
+                    scores = pipe.transform(X_transform)
+
+                    for i in range(n_components):
+                        df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
+
+                # Fill missing values
+                pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                for col in pca_cols:
+                    if col not in df.columns:
+                        df[col] = 0
+                df[pca_cols] = df[pca_cols].fillna(0)
+
+        return df, pcas_dict
+
     # encoding categorical features
     def encode_categorical_features(
         self,
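For context on what the new add_pca_feature_temporal does: each simplified pca_temporal entry of the form {"name": ..., "column": ..., "lags": ...} tells it to build the lag columns itself, refit an imputer/scaler/PCA pipeline on a trailing lookback window at most every refresh_frequency days, and score only the rows at the current date, so no future observations leak into the fit. Below is a simplified, standalone sketch of that walk-forward loop under stated assumptions: a single toy series, plain Timestamps instead of the ordinal-encoded dates the package uses, and none of the panel/group or old-format handling.

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)

    # Toy single-asset daily series; RET stands in for any numeric base column.
    df = pd.DataFrame({"DATE": pd.bdate_range("2022-01-03", periods=300),
                       "RET": rng.normal(size=300)})

    # 1) Build the lag columns, as the new method does for a simple time series.
    num_lags, n_components = 20, 5
    lag_cols = [f"RET_-{i}" for i in range(1, num_lags + 1)]
    for i in range(1, num_lags + 1):
        df[f"RET_-{i}"] = df["RET"].shift(i)

    # 2) Walk forward: refit on a trailing window at most every refresh_frequency
    #    days, then transform only the current date, avoiding look-ahead bias.
    lookback_days, refresh_frequency = 365, 90
    pipe, last_fit = None, None
    pc_cols = [f"TMP_PC_LAST_20_RET_{i}" for i in range(n_components)]
    for current in df["DATE"]:
        past = df[df["DATE"] < current].dropna(subset=lag_cols)
        if (pipe is None or (current - last_fit).days >= refresh_frequency) and len(past) > num_lags * 2:
            window = past[past["DATE"] >= current - pd.Timedelta(days=lookback_days)]
            pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
                ("pca", PCA(n_components=n_components, random_state=0)),
            ]).fit(window[lag_cols])
            last_fit = current
        if pipe is not None:
            rows = df[df["DATE"] == current].dropna(subset=lag_cols)
            if not rows.empty:
                scores = pipe.transform(rows[lag_cols])
                for i in range(n_components):
                    df.loc[rows.index, pc_cols[i]] = scores[:, i]

    # 3) Forward-fill dates scored before the first fit existed, then default to 0.
    for col in pc_cols:
        if col not in df.columns:
            df[col] = 0.0
    df[pc_cols] = df[pc_cols].ffill().fillna(0)

In the package itself, the inference path (when pcas is passed in) skips this refit loop and applies the stored pipelines to all complete rows at once.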
@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.21.0"
+version = "0.21.1"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}