lecrapaud 0.21.0__tar.gz → 0.21.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/PKG-INFO +1 -1
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/feature_engineering.py +189 -3
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/pyproject.toml +1 -1
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/LICENSE +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/README.md +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/__init__.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/api.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/config.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/__init__.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/README +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/env.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/script.py.mako +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic.ini +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/__init__.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/base.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/experiment.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature_selection.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature_selection_rank.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model_selection.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model_selection_score.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/target.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/utils.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/session.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/directories.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/experiment.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/feature_selection.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/integrations/openai_integration.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/__init__.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/config.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/scheduler.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/tasks.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/model_selection.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/search_space.py +0 -0
- {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/utils.py +0 -0
|
@@ -605,7 +605,7 @@ class PreprocessFeature:
|
|
|
605
605
|
|
|
606
606
|
return df, pcas_dict
|
|
607
607
|
|
|
608
|
-
def
|
|
608
|
+
def add_pca_feature_cross_sectional_old(
|
|
609
609
|
self,
|
|
610
610
|
df: pd.DataFrame,
|
|
611
611
|
*,
|
|
@@ -657,7 +657,7 @@ class PreprocessFeature:
|
|
|
657
657
|
|
|
658
658
|
return df, pcas_dict
|
|
659
659
|
|
|
660
|
-
def
|
|
660
|
+
def add_pca_feature_cross_sectional(
|
|
661
661
|
self,
|
|
662
662
|
df: pd.DataFrame,
|
|
663
663
|
*,
|
|
@@ -840,6 +840,11 @@ class PreprocessFeature:
|
|
|
840
840
|
# Merger les scores
|
|
841
841
|
df = df.merge(scores_df, on=index_col, how="left")
|
|
842
842
|
df.index = index_saved
|
|
843
|
+
|
|
844
|
+
# Forward fill puis 0 pour éviter les NaN
|
|
845
|
+
pca_cols = [col for col in df.columns if col.startswith(prefix)]
|
|
846
|
+
df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
|
|
847
|
+
|
|
843
848
|
pcas_dict.update({name: pipe})
|
|
844
849
|
|
|
845
850
|
else:
|
|
@@ -873,7 +878,7 @@ class PreprocessFeature:
|
|
|
873
878
|
return df, pcas_dict
|
|
874
879
|
|
|
875
880
|
# ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
|
|
876
|
-
def
|
|
881
|
+
def add_pca_feature_temporal_old(
|
|
877
882
|
self,
|
|
878
883
|
df: pd.DataFrame,
|
|
879
884
|
*,
|
|
@@ -936,6 +941,187 @@ class PreprocessFeature:
|
|
|
936
941
|
|
|
937
942
|
return df, pcas_dict
|
|
938
943
|
|
|
944
|
+
def add_pca_feature_temporal(
    self,
    df: pd.DataFrame,
    *,
    n_components: int = 5,
    pcas: dict[str, Pipeline] | None = None,
    impute_strategy: str = "median",
    standardize: bool = True,
    lookback_days: int = 365,
    refresh_frequency: int = 90,
) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
    """Add temporal PCA components computed from lagged columns.

    One PCA is handled per entry of ``self.pca_temporal``. Two entry formats
    are accepted:

    * legacy: ``{"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2", ...]}``
      - the listed lag columns must already exist in ``df``;
    * simplified: ``{"name": "LAST_20_RET", "column": "RET", "lags": 20}``
      - missing lag columns ``<COLUMN>_-<k>`` are created here (per group
      when ``self.group_column`` is set). ``lags`` defaults to 20.

    The components are written to ``TMP_PC_<name>_<i>`` columns.

    When ``self.time_series`` and ``self.date_column`` are both truthy and no
    fitted pipelines are supplied, the PCA is fitted on an expanding schedule:
    for each date, only rows strictly before that date (and no older than
    ``lookback_days``) are used for fitting, and the pipeline is refitted once
    at least ``refresh_frequency`` days have elapsed since the last fit. This
    avoids look-ahead bias.

    Args:
        df: Input frame. It is modified in place (lag and PCA columns are
            added) and also returned.
        n_components: Number of principal components to keep.
        pcas: Previously fitted pipelines keyed by PCA name. When given, no
            fitting happens and these pipelines are only applied.
        impute_strategy: Strategy for the ``SimpleImputer`` step, or ``None``
            to skip imputation.
        standardize: Whether to add a ``StandardScaler`` step before the PCA.
        lookback_days: Size, in days, of the fitting window (time-series
            training only).
        refresh_frequency: Minimum number of days between two refits
            (time-series training only).

    Returns:
        The frame with the added columns and a dict mapping each PCA name to
        the pipeline used. In time-series training the stored value is the
        last fitted pipeline, or ``None`` if no fit was ever possible.

    Raises:
        KeyError: If ``pcas`` is given but has no entry for a configured name,
            or if a configured column is missing from ``df``.
    """

    def build_pipeline():
        # Same step order in every branch: impute -> scale -> PCA.
        steps = []
        if impute_strategy is not None:
            steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
        if standardize:
            steps.append(("scaler", StandardScaler()))
        steps.append(("pca", PCA(n_components=n_components, random_state=0)))
        return Pipeline(steps)

    def write_scores(fitted_pipe, row_mask, lag_columns, pca_cols):
        # A pipeline can be None when it comes from a training run that never
        # had enough history to fit; in that case there is nothing to project.
        if fitted_pipe is None or not row_mask.any():
            return
        scores = fitted_pipe.transform(df.loc[row_mask, lag_columns])
        for i, col in enumerate(pca_cols):
            df.loc[row_mask, col] = scores[:, i]

    def finalize(pca_cols, forward_fill):
        # Guarantee that every component column exists, then remove NaN.
        for col in pca_cols:
            if col not in df.columns:
                df[col] = 0
        filled = df[pca_cols]
        if forward_fill:
            # ``fillna(method="ffill")`` is deprecated since pandas 2.1.
            # NOTE(review): the forward fill follows row order and is not
            # grouped by ``self.group_column`` - confirm this is intended for
            # panel data.
            filled = filled.ffill()
        df[pca_cols] = filled.fillna(0)

    pcas_dict = {}

    for pca_config in self.pca_temporal:
        name = pca_config["name"]

        if "columns" in pca_config:
            # Legacy format: the lag columns are provided by the caller.
            lag_columns = pca_config["columns"]
            num_lags = len(lag_columns)
        else:
            # Simplified format: derive (and create if needed) the lag columns.
            base_column = pca_config["column"].upper()
            num_lags = pca_config.get("lags", 20)
            lag_columns = [
                f"{base_column}_-{lag}" for lag in range(1, num_lags + 1)
            ]
            for lag, lag_col in enumerate(lag_columns, start=1):
                if lag_col in df.columns:
                    continue
                if self.group_column:
                    # Panel data: shift inside each group only.
                    df[lag_col] = df.groupby(self.group_column)[
                        base_column
                    ].shift(lag)
                else:
                    df[lag_col] = df[base_column].shift(lag)

        prefix = f"TMP_PC_{name}"
        pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
        # Rows usable by the PCA: every lag value is present.
        complete_rows = df[lag_columns].notna().all(axis=1)

        if self.time_series and self.date_column:
            if pcas is not None:
                # Inference: apply the provided pipeline to all rows at once.
                pipe = pcas[name]
                write_scores(pipe, complete_rows, lag_columns, pca_cols)
            else:
                # Training: expanding window with periodic refresh.
                pipe = None
                last_fit_date = None
                dates = df[self.date_column]

                for current_date_ordinal in sorted(dates.unique()):
                    # Date values are fed to ``Timestamp.fromordinal``, so the
                    # date column is expected to hold date ordinals.
                    current_date = pd.Timestamp.fromordinal(
                        int(current_date_ordinal)
                    )
                    is_past = dates < current_date_ordinal

                    should_refit = pipe is None or (
                        last_fit_date is not None
                        and (current_date - last_fit_date).days
                        >= refresh_frequency
                    )

                    if should_refit and is_past.sum() > num_lags * 2:
                        lookback_start = current_date - pd.Timedelta(
                            days=lookback_days
                        )
                        # Fit strictly on the past, within the lookback window.
                        mask_fit = (
                            (dates >= lookback_start.toordinal())
                            & is_past
                            & complete_rows
                        )
                        if mask_fit.sum() >= n_components:
                            X_fit = df.loc[mask_fit, lag_columns]
                            pipe = build_pipeline()
                            pipe.fit(X_fit)
                            last_fit_date = current_date
                            logger.debug(
                                "Temporal PCA %s refitted at %s using %d samples",
                                name,
                                current_date.strftime("%Y-%m-%d"),
                                len(X_fit),
                            )

                    # Project the current date with the latest available fit.
                    mask_current = (dates == current_date_ordinal) & complete_rows
                    write_scores(pipe, mask_current, lag_columns, pca_cols)

                if pipe is None:
                    logger.warning(
                        "Temporal PCA %s could not be fitted (not enough "
                        "history); components are set to 0",
                        name,
                    )

            finalize(pca_cols, forward_fill=True)
            pcas_dict[name] = pipe
        else:
            # Non time-series: single fit on every complete row.
            if pcas is not None:
                pipe = pcas[name]
            elif complete_rows.any():
                pipe = build_pipeline()
                pipe.fit(df.loc[complete_rows, lag_columns])
            else:
                # Nothing to fit on and nothing provided: skip this PCA.
                continue

            pcas_dict[name] = pipe
            write_scores(pipe, complete_rows, lag_columns, pca_cols)
            finalize(pca_cols, forward_fill=False)

    return df, pcas_dict
|
|
1124
|
+
|
|
939
1125
|
# encoding categorical features
|
|
940
1126
|
def encode_categorical_features(
|
|
941
1127
|
self,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py
RENAMED
|
File without changes
|
{lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py
RENAMED
|
File without changes
|
{lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py
RENAMED
|
File without changes
|
{lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|