upgini 1.2.86a2__py3-none-any.whl → 1.2.87__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/data_source/data_source_publisher.py +21 -0
- upgini/features_enricher.py +91 -41
- upgini/metrics.py +103 -41
- upgini/resource_bundle/strings.properties +3 -1
- upgini/utils/datetime_utils.py +130 -118
- upgini/utils/deduplicate_utils.py +4 -4
- upgini/utils/sklearn_ext.py +112 -8
- {upgini-1.2.86a2.dist-info → upgini-1.2.87.dist-info}/METADATA +1 -1
- {upgini-1.2.86a2.dist-info → upgini-1.2.87.dist-info}/RECORD +12 -12
- {upgini-1.2.86a2.dist-info → upgini-1.2.87.dist-info}/WHEEL +0 -0
- {upgini-1.2.86a2.dist-info → upgini-1.2.87.dist-info}/licenses/LICENSE +0 -0
upgini/metrics.py
CHANGED
@@ -6,16 +6,26 @@ import re
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    runtime_checkable,
+)

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
-from category_encoders.cat_boost import CatBoostEncoder
from lightgbm import LGBMClassifier, LGBMRegressor
from numpy import log1p
-from pandas.api.types import
+from pandas.api.types import is_float_dtype, is_integer_dtype, is_numeric_dtype
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score

from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -32,10 +42,7 @@ except ImportError:
    available_scorers = SCORERS
from sklearn.metrics import mean_squared_error
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import (
-    BaseCrossValidator,
-    TimeSeriesSplit,
-)
+from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit

from upgini.errors import ValidationError
from upgini.metadata import ModelTaskType
@@ -57,6 +64,16 @@ CATBOOST_REGRESSION_PARAMS = {
    "allow_writing_files": False,
}

+CATBOOST_TS_PARAMS = {
+    "learning_rate": 0.05,
+    "early_stopping_rounds": 20,
+    "use_best_model": True,
+    "one_hot_max_size": 100,
+    "verbose": False,
+    "random_state": 42,
+    "allow_writing_files": False,
+}
+
CATBOOST_BINARY_PARAMS = {
    "iterations": 250,
    "learning_rate": 0.05,
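The new CATBOOST_TS_PARAMS set adds early stopping and best-model selection on top of the usual defaults. A minimal sketch of what these settings imply for a CatBoost model; the synthetic data and the train/validation split below are illustrative, not upgini's own code:

```python
import numpy as np
from catboost import CatBoostRegressor

# Illustrative data; early_stopping_rounds/use_best_model only take effect with an eval_set.
rng = np.random.default_rng(42)
X_train, y_train = rng.random((200, 3)), rng.random(200)
X_valid, y_valid = rng.random((50, 3)), rng.random(50)

model = CatBoostRegressor(
    learning_rate=0.05,
    early_stopping_rounds=20,   # stop when the eval metric stops improving
    use_best_model=True,        # keep the iteration that scored best on eval_set
    one_hot_max_size=100,
    verbose=False,
    random_state=42,
    allow_writing_files=False,
)
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
```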
@@ -311,6 +328,7 @@ class EstimatorWrapper:
        self.target_type = target_type
        self.add_params = add_params
        self.cv_estimators = None
+        self.cv_cat_encoders: Optional[List[Optional[HasTransform]]] = None
        self.groups = groups
        self.text_features = text_features
        self.logger = logger or logging.getLogger()
@@ -391,9 +409,7 @@ class EstimatorWrapper:
                self.converted_to_int.append(c)
                self.cat_features.remove(c)
            elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
-                self.logger.info(
-                    f"Convert float cat feature {c} to string"
-                )
+                self.logger.info(f"Convert float cat feature {c} to string")
                x[c] = x[c].astype(str)
                self.converted_to_str.append(c)
            elif x[c].dtype not in ["category", "int64"]:
@@ -439,7 +455,9 @@ class EstimatorWrapper:

        return x, y, {}

-    def calculate_shap(
+    def calculate_shap(
+        self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
+    ) -> Optional[Dict[str, float]]:
        return None

    def cross_val_predict(
@@ -470,9 +488,11 @@ class EstimatorWrapper:
            fit_params=fit_params,
            return_estimator=True,
            error_score="raise",
+            random_state=DEFAULT_RANDOM_STATE,
        )
        metrics_by_fold = cv_results["test_score"]
        self.cv_estimators = cv_results["estimator"]
+        self.cv_cat_encoders = cv_results["cat_encoder"]

        self.check_fold_metrics(metrics_by_fold)

@@ -480,14 +500,14 @@ class EstimatorWrapper:

        splits = self.cv.split(x, y, groups)

-        for estimator, split in zip(self.cv_estimators, splits):
+        for estimator, cat_encoder, split in zip(self.cv_estimators, self.cv_cat_encoders, splits):
            _, validation_idx = split
            cv_x = x.iloc[validation_idx]
            if isinstance(y, pd.Series):
                cv_y = y.iloc[validation_idx]
            else:
                cv_y = y[validation_idx]
-            shaps = self.calculate_shap(cv_x, cv_y, estimator)
+            shaps = self.calculate_shap(cv_x, cv_y, estimator, cat_encoder)
            if shaps is not None:
                for feature, shap_value in shaps.items():
                    shap_values_all_folds[feature].append(shap_value)
@@ -527,8 +547,19 @@ class EstimatorWrapper:
            metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
        else:
            metrics = []
-            for est in self.cv_estimators:
-
+            for est, cat_encoder in zip(self.cv_estimators, self.cv_cat_encoders):
+                x_copy = x.copy()
+                if cat_encoder is not None:
+                    if hasattr(cat_encoder, "feature_names_in_"):
+                        encoded = cat_encoder.transform(x_copy[cat_encoder.feature_names_in_])
+                    else:
+                        encoded = cat_encoder.transform(x[self.cat_features])
+                    if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+                        encoded = encoded.astype(int)
+                    else:
+                        encoded = encoded.astype("category")
+                    x_copy[self.cat_features] = encoded
+                metrics.append(self.scorer(est, x_copy, y))

            metric, metric_std = self._calculate_metric_from_folds(metrics)
        return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
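The wrapper now keeps one fitted categorical encoder per CV fold (cv_cat_encoders) and re-applies it before scoring each fold's estimator. A self-contained sketch of that pattern, using category_encoders' CatBoostEncoder as a stand-in for whatever encoder cross_validate returns per fold; the toy frame is illustrative:

```python
import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder

# Toy data with one categorical column
x = pd.DataFrame({"city": ["a", "b", "a", "c", "b", "a"], "amount": [10.0, 20.0, 15.0, 30.0, 25.0, 12.0]})
y = pd.Series([0, 1, 0, 1, 1, 0])

# A fold-level encoder, analogous to the ones stored in cv_cat_encoders
encoder = CatBoostEncoder(cols=["city"]).fit(x[["city"]], y)

x_copy = x.copy()
encoded = encoder.transform(x_copy[["city"]])
# The diff casts to int for (blocked) time-series splits and to pandas "category" otherwise
x_copy[["city"]] = encoded.astype("category")
print(x_copy.dtypes)
```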
@@ -551,7 +582,7 @@ class EstimatorWrapper:
        text_features: Optional[List[str]] = None,
        add_params: Optional[Dict[str, Any]] = None,
        groups: Optional[List[str]] = None,
-
+        has_time: bool = False,
    ) -> EstimatorWrapper:
        scorer, metric_name, multiplier = define_scorer(target_type, scoring)
        kwargs = {
@@ -568,7 +599,7 @@ class EstimatorWrapper:
        if estimator is None:
            if EstimatorWrapper.default_estimator == "catboost":
                logger.info("Using CatBoost as default estimator")
-                params = {"has_time":
+                params = {"has_time": has_time}
                if target_type == ModelTaskType.MULTICLASS:
                    params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
                    params = _get_add_params(params, add_params)
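has_time is a native CatBoost training option: it tells the library to respect the given row order instead of generating random permutations, which matters when the folds come from a time-series split. A minimal illustration (the iterations value is arbitrary):

```python
from catboost import CatBoostRegressor

# has_time=True makes CatBoost treat rows as chronologically ordered
# instead of permuting them for ordered boosting.
model = CatBoostRegressor(iterations=100, has_time=True, verbose=False, allow_writing_files=False)
print(model.get_params()["has_time"])
```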
@@ -578,7 +609,10 @@ class EstimatorWrapper:
                    params = _get_add_params(params, add_params)
                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
                elif target_type == ModelTaskType.REGRESSION:
-
+                    if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                        params = _get_add_params(params, CATBOOST_TS_PARAMS)
+                    else:
+                        params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
                    params = _get_add_params(params, add_params)
                    estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
                else:
|
|
610
644
|
estimator_copy = deepcopy(estimator)
|
611
645
|
kwargs["estimator"] = estimator_copy
|
612
646
|
if is_catboost_estimator(estimator):
|
613
|
-
if
|
614
|
-
estimator_copy.set_params(has_time=
|
647
|
+
if has_time is not None:
|
648
|
+
estimator_copy.set_params(has_time=has_time)
|
615
649
|
estimator = CatBoostWrapper(**kwargs)
|
616
650
|
else:
|
617
651
|
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
@@ -769,15 +803,26 @@ class CatBoostWrapper(EstimatorWrapper):
            else:
                raise e

-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder) -> Optional[Dict[str, float]]:
        try:
            from catboost import Pool

+            cat_features = None
+            if cat_encoder is not None:
+                if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+                    encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
+                else:
+                    encoded = cat_encoder.transform(x[self.cat_features])
+                cat_features = encoded.columns.to_list()
+                x[self.cat_features] = encoded
+            else:
+                cat_features = self.cat_features
+
            # Create Pool for fold data, if need (for example, when categorical features are present)
            fold_pool = Pool(
                x,
                y,
-                cat_features=
+                cat_features=cat_features,
                text_features=self.text_features,
                embedding_features=self.grouped_embedding_features,
            )
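The CatBoost wrapper builds a Pool from the fold data, with the encoded categorical columns, before extracting SHAP values. A standalone sketch of per-feature SHAP extraction with catboost; the data and model settings are illustrative and this is not upgini's exact call:

```python
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool

rng = np.random.default_rng(0)
x = pd.DataFrame({"f1": rng.random(100), "cat": rng.choice(["a", "b", "c"], 100)})
y = rng.integers(0, 2, 100)

model = CatBoostClassifier(iterations=20, verbose=False, allow_writing_files=False)
model.fit(x, y, cat_features=["cat"])

pool = Pool(x, y, cat_features=["cat"])
# "ShapValues" returns an (n_rows, n_features + 1) matrix; the last column is the expected value
shap = model.get_feature_importance(pool, type="ShapValues")
mean_abs_shap = dict(zip(x.columns, np.abs(shap[:, :-1]).mean(axis=0)))
print(mean_abs_shap)
```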
@@ -834,7 +879,6 @@ class LightGBMWrapper(EstimatorWrapper):
            text_features=text_features,
            logger=logger,
        )
-        self.cat_encoder = None
        self.n_classes = None

    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -846,10 +890,10 @@ class LightGBMWrapper(EstimatorWrapper):
            params["eval_metric"] = "auc"
        params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
        if self.cat_features:
-
-
-
-
+            for c in self.cat_features:
+                if x[c].dtype != "category":
+                    x[c] = x[c].astype("category")
+
        for c in x.columns:
            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -859,15 +903,26 @@ class LightGBMWrapper(EstimatorWrapper):

    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
        x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.cat_features
-
-
+        if self.cat_features:
+            for c in self.cat_features:
+                if x[c].dtype != "category":
+                    x[c] = x[c].astype("category")
        return x, y_numpy, params

-    def calculate_shap(
+    def calculate_shap(
+        self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
+    ) -> Optional[Dict[str, float]]:
        try:
+            x_copy = x.copy()
+            if cat_encoder is not None:
+                if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+                    encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
+                else:
+                    encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
+                x_copy[self.cat_features] = encoded
+
            shap_matrix = estimator.predict(
-
+                x_copy,
                predict_disable_shape_check=True,
                raw_score=True,
                pred_leaf=False,
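For LightGBM the wrapper feeds the re-encoded fold frame into estimator.predict with SHAP-style flags. The diff only shows part of that call, so the sketch below uses pred_contrib=True, the standard LightGBM route to per-feature contributions, rather than upgini's exact arguments:

```python
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier

rng = np.random.default_rng(0)
x = pd.DataFrame({"f1": rng.random(200), "f2": rng.random(200)})
y = (x["f1"] + x["f2"] > 1).astype(int)

model = LGBMClassifier(n_estimators=20, verbose=-1).fit(x, y)
# pred_contrib=True returns one contribution per feature plus a bias column per row
contrib = model.predict(x, pred_contrib=True)
mean_abs = dict(zip(x.columns, np.abs(contrib[:, :-1]).mean(axis=0)))
print(mean_abs)
```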
@@ -926,10 +981,10 @@ class OtherEstimatorWrapper(EstimatorWrapper):
        num_features = [col for col in x.columns if col not in self.cat_features]
        x[num_features] = x[num_features].fillna(-999)
        if self.cat_features:
-
-
-
-
+            for c in self.cat_features:
+                if x[c].dtype != "category":
+                    x[c] = x[c].astype("category")
+            params["cat_features"] = self.cat_features
        for c in x.columns:
            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -940,15 +995,22 @@ class OtherEstimatorWrapper(EstimatorWrapper):
    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
        x, y_numpy, params = super()._prepare_to_calculate(x, y)
        if self.cat_features is not None:
+            for c in self.cat_features:
+                if x[c].dtype != "category":
+                    x[c] = x[c].astype("category")
            num_features = [col for col in x.columns if col not in self.cat_features]
-
-
-
-
-            ).astype("category")
+        else:
+            num_features = x.columns
+        x[num_features] = x[num_features].fillna(-999)
+
        return x, y_numpy, params


+@runtime_checkable
+class HasTransform(Protocol):
+    def transform(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> pd.DataFrame: ...
+
+
def validate_scoring_argument(scoring: Union[Callable, str, None]):
    if scoring is None:
        return
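HasTransform is a runtime-checkable Protocol, so any fitted encoder that exposes a compatible transform method satisfies isinstance checks without inheriting from it. A small standalone illustration; the IdentityEncoder class is made up for the example:

```python
from typing import Optional, Protocol, Union, runtime_checkable

import numpy as np
import pandas as pd


@runtime_checkable
class HasTransform(Protocol):
    def transform(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> pd.DataFrame: ...


class IdentityEncoder:
    # No inheritance from HasTransform: structural typing is enough
    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return X.copy()


print(isinstance(IdentityEncoder(), HasTransform))  # True
```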
upgini/resource_bundle/strings.properties
CHANGED
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
no_important_features_for_transform=There are no important features for transform. Return input as transformed
search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
+binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
+binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set

# Validation errors
# params validation
@@ -156,7 +158,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
dataset_empty_column_names=Some column names are empty. Add names please
dataset_full_duplicates={:.5f}% of the rows are fully duplicated
-dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\
+dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
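The updated dataset_diff_target_duplicates message carries a third placeholder for a sample of the offending row indexes. The real lookup goes through upgini's ResourceBundle; this snippet only shows how such a template renders, with made-up values:

```python
template = (
    "{:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. "
    "These rows will be deleted as incorrect\nSample of incorrect row indexes: {}"
)
print(template.format(0.1234, 42, [5, 17, 93]))
```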
upgini/utils/datetime_utils.py
CHANGED
@@ -41,6 +41,7 @@ class DateTimeSearchKeyConverter:
        date_format: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
        bundle: Optional[ResourceBundle] = None,
+        generate_cyclical_features: bool = True,
    ):
        self.date_column = date_column
        self.date_format = date_format
@@ -51,6 +52,7 @@ class DateTimeSearchKeyConverter:
            self.logger.setLevel("FATAL")
        self.generated_features: List[str] = []
        self.bundle = bundle or get_custom_bundle()
+        self.generate_cyclical_features = generate_cyclical_features
        self.has_old_dates = False

    @staticmethod
@@ -121,61 +123,63 @@ class DateTimeSearchKeyConverter:
            df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
            self.generated_features.append(cos_feature)

-
+        if self.generate_cyclical_features:

-
-        df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
+            df["quarter"] = df[self.date_column].dt.quarter

-
-
+            # Calculate the start date of the quarter for each timestamp
+            df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time

-
-
-        start = df["quarter_start"]
-        year = start.dt.year
-        month = start.dt.month
+            # Calculate the day in the quarter
+            df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1

-
-
+            # Vectorized calculation of days_in_quarter
+            quarter = df["quarter"]
+            start = df["quarter_start"]
+            year = start.dt.year
+            month = start.dt.month

-
-
+            quarter_end_year = np.where(quarter == 4, year + 1, year)
+            quarter_end_month = np.where(quarter == 4, 1, month + 3)

-
+            end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
+            end.index = df.index

-
+            df["days_in_quarter"] = (end - start).dt.days

-
+            add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"])  # Days in the quarter

-
+            df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)

-
-        if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
-            self.logger.info("Time found in date search key. Add extra features based on time")
+            df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds

-
-
-
-            df["hour"] = df[self.date_column].dt.hour
+            seconds_without_na = df[seconds].dropna()
+            if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
+                self.logger.info("Time found in date search key. Add extra features based on time")

-
-
-
-
-            add_cyclical_features(df, "hour", 24)  # Hours in a day
+                # Extract basic components
+                df["second"] = df[self.date_column].dt.second
+                df["minute"] = df[self.date_column].dt.minute
+                df["hour"] = df[self.date_column].dt.hour

-
-
-
-
+                # Apply cyclical transformations
+                add_cyclical_features(df, "second", 60)  # Seconds in a minute
+                add_cyclical_features(df, "minute", 60)  # Minutes in an hour
+                add_cyclical_features(df, "minute", 30)  # Minutes in half an hour
+                add_cyclical_features(df, "hour", 24)  # Hours in a day
+
+                # Drop intermediate columns if not needed
+                df.drop(columns=["second", "minute", "hour"], inplace=True)
+            else:
+                keep_time = False

-
-
-
-
-
+        for generated_feature in self.generated_features[:]:
+            if df[generated_feature].dropna().nunique() <= 1:
+                self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
+                df.drop(columns=generated_feature, inplace=True)
+                self.generated_features.remove(generated_feature)

-
+        df.drop(columns=seconds, inplace=True)

        if keep_time:
            df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
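Cyclical date parts are encoded as sine/cosine pairs so that wrap-around values (hour 23 versus hour 0) stay close in feature space. A standalone sketch of that transform; the helper mirrors the add_cyclical_features calls in the diff, but the feature names and data here are illustrative:

```python
import numpy as np
import pandas as pd


def add_cyclical_features(df: pd.DataFrame, column: str, period) -> None:
    # Map a periodic component onto the unit circle (feature names are illustrative)
    df[f"{column}_sin"] = np.sin(2 * np.pi * df[column] / period)
    df[f"{column}_cos"] = np.cos(2 * np.pi * df[column] / period)


df = pd.DataFrame({"hour": np.arange(48) % 24})
add_cyclical_features(df, "hour", 24)  # hours in a day, as in the diff
print(df.loc[[0, 23, 24], ["hour", "hour_sin", "hour_cos"]])  # hour 23 lands next to hour 0
```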
@@ -247,99 +251,107 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:


def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
-
-
-
-
-
-
-
-
-
-
-
-    df.drop(columns=columns_to_drop, inplace=True)
-    # Date, not datetime
-    if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
-        return False
+    try:
+        df = df.copy()
+        seconds = "datetime_seconds"
+        if isinstance(df[date_col].dtype, pd.PeriodDtype):
+            df[date_col] = df[date_col].dt.to_timestamp()
+        elif is_numeric_dtype(df[date_col]):
+            df[date_col] = pd.to_datetime(df[date_col], unit="ms")
+        else:
+            df[date_col] = pd.to_datetime(df[date_col])
+        df[date_col] = df[date_col].dt.tz_localize(None)
+        df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds

-
-
-
-
+        seconds_without_na = df[seconds].dropna()
+        columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
+        df.drop(columns=columns_to_drop, inplace=True)
+        # Date, not datetime
+        if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
+            return False

-
-
-
-
-
-
+        nunique_dates = df[date_col].nunique()
+        # Unique dates count more than 270
+        if nunique_dates < 270:
+            return False
+
+        min_date = df[date_col].min()
+        max_date = df[date_col].max()
+        days_delta = (max_date - min_date).days + 1
+        # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
+        if nunique_dates / days_delta < 0.3:
+            return False

-
+        accumulated_changing_columns = set()

-
-
-
+        def check_differences(group: pd.DataFrame):
+            changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
+            accumulated_changing_columns.update(changing_columns)

-
-
+        def is_multiple_rows(group: pd.DataFrame) -> bool:
+            return group.shape[0] > 1

-
-
+        grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
+        dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()

-
-
-
+        # share of dates with more than one record is more than 99%
+        if dates_with_multiple_rows / nunique_dates < 0.99:
+            return False

-
-
+        if df.shape[1] <= 3:
+            return True

-
-
+        grouped.apply(check_differences)
+        return len(accumulated_changing_columns) <= 2
+    except Exception:
+        return False


def is_dates_distribution_valid(
    df: pd.DataFrame,
    search_keys: Dict[str, SearchKey],
) -> bool:
-
+    try:
+        maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])

-
-
-
-
+        if EVAL_SET_INDEX in df.columns:
+            X = df.query(f"{EVAL_SET_INDEX} == 0")
+        else:
+            X = df

-
-
-
-
-
-
+        if maybe_date_col is None:
+            for col in X.columns:
+                if col in search_keys:
+                    continue
+                try:
+                    if isinstance(X[col].dtype, pd.PeriodDtype):
+                        pass
+                    elif pd.__version__ >= "2.0.0":
+                        # Format mixed to avoid massive warnings
+                        pd.to_datetime(X[col], format="mixed")
+                    else:
+                        pd.to_datetime(X[col])
+                    maybe_date_col = col
+                    break
+                except Exception:
                    pass
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    date_counts = dates.value_counts().sort_index()
-
-    date_counts_1 = date_counts[: round(len(date_counts) / 2)]
-    date_counts_2 = date_counts[round(len(date_counts) / 2) :]
-    ratio = date_counts_2.mean() / date_counts_1.mean()
-
-    return ratio >= 0.8 and ratio <= 1.2
+
+        if maybe_date_col is None:
+            return
+
+        if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
+            dates = X[maybe_date_col].dt.to_timestamp().dt.date
+        elif pd.__version__ >= "2.0.0":
+            dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
+        else:
+            dates = pd.to_datetime(X[maybe_date_col]).dt.date
+
+        date_counts = dates.value_counts().sort_index()
+
+        date_counts_1 = date_counts[: round(len(date_counts) / 2)]
+        date_counts_2 = date_counts[round(len(date_counts) / 2) :]
+        ratio = date_counts_2.mean() / date_counts_1.mean()
+
+        return ratio >= 0.8 and ratio <= 1.2
+    except Exception:
+        return False
upgini/utils/deduplicate_utils.py
CHANGED
@@ -104,9 +104,9 @@ def remove_fintech_duplicates(
    sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)

    # Convert date columns for further checks
-    sub_df = DateTimeSearchKeyConverter(
-
-    )
+    sub_df = DateTimeSearchKeyConverter(
+        date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
+    ).convert(sub_df)
    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)

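The converter is now instantiated with generate_cyclical_features=False inside the duplicate check, so no cyclical helper columns leak into duplicate detection. A hypothetical usage along the same lines; the DataFrame and column name are made up, while DateTimeSearchKeyConverter, convert and generated_features are upgini's own API as shown in this diff:

```python
import pandas as pd
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter

df = pd.DataFrame({"date": ["2024-01-01", "2024-01-02", "2024-01-03"], "target": [0, 1, 0]})
converter = DateTimeSearchKeyConverter("date", generate_cyclical_features=False)
df = converter.convert(df)
print(converter.generated_features)  # inspect which helper columns were added
```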
@@ -192,7 +192,7 @@ def clean_full_duplicates(
    unique_columns.remove(TARGET)
    marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
    if marked_duplicates.sum() > 0:
-        dups_indices = df[marked_duplicates].index.to_list()
+        dups_indices = df[marked_duplicates].index.to_list()[:100]
        nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
        num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
        share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup