upgini 1.2.86.dev1__py3-none-any.whl → 1.2.87__py3-none-any.whl

This diff shows the changes between two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
upgini/metrics.py CHANGED
@@ -6,16 +6,26 @@ import re
  from collections import defaultdict
  from copy import deepcopy
  from dataclasses import dataclass
- from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+ from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Protocol,
+ Tuple,
+ Union,
+ runtime_checkable,
+ )

  import lightgbm as lgb
  import numpy as np
  import pandas as pd
  from catboost import CatBoostClassifier, CatBoostRegressor
- from category_encoders.cat_boost import CatBoostEncoder
  from lightgbm import LGBMClassifier, LGBMRegressor
  from numpy import log1p
- from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
+ from pandas.api.types import is_float_dtype, is_integer_dtype, is_numeric_dtype
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score

  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -32,10 +42,7 @@ except ImportError:
  available_scorers = SCORERS
  from sklearn.metrics import mean_squared_error
  from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
- from sklearn.model_selection import ( # , TimeSeriesSplit
- BaseCrossValidator,
- TimeSeriesSplit,
- )
+ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit

  from upgini.errors import ValidationError
  from upgini.metadata import ModelTaskType
@@ -57,6 +64,16 @@ CATBOOST_REGRESSION_PARAMS = {
  "allow_writing_files": False,
  }

+ CATBOOST_TS_PARAMS = {
+ "learning_rate": 0.05,
+ "early_stopping_rounds": 20,
+ "use_best_model": True,
+ "one_hot_max_size": 100,
+ "verbose": False,
+ "random_state": 42,
+ "allow_writing_files": False,
+ }
+
  CATBOOST_BINARY_PARAMS = {
  "iterations": 250,
  "learning_rate": 0.05,
@@ -311,6 +328,7 @@ class EstimatorWrapper:
  self.target_type = target_type
  self.add_params = add_params
  self.cv_estimators = None
+ self.cv_cat_encoders: Optional[List[Optional[HasTransform]]] = None
  self.groups = groups
  self.text_features = text_features
  self.logger = logger or logging.getLogger()
@@ -391,9 +409,7 @@ class EstimatorWrapper:
  self.converted_to_int.append(c)
  self.cat_features.remove(c)
  elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
- self.logger.info(
- f"Convert float cat feature {c} to string"
- )
+ self.logger.info(f"Convert float cat feature {c} to string")
  x[c] = x[c].astype(str)
  self.converted_to_str.append(c)
  elif x[c].dtype not in ["category", "int64"]:
@@ -439,7 +455,9 @@ class EstimatorWrapper:

  return x, y, {}

- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+ def calculate_shap(
+ self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
+ ) -> Optional[Dict[str, float]]:
  return None

  def cross_val_predict(
@@ -470,9 +488,11 @@ class EstimatorWrapper:
  fit_params=fit_params,
  return_estimator=True,
  error_score="raise",
+ random_state=DEFAULT_RANDOM_STATE,
  )
  metrics_by_fold = cv_results["test_score"]
  self.cv_estimators = cv_results["estimator"]
+ self.cv_cat_encoders = cv_results["cat_encoder"]

  self.check_fold_metrics(metrics_by_fold)

@@ -480,14 +500,14 @@ class EstimatorWrapper:

  splits = self.cv.split(x, y, groups)

- for estimator, split in zip(self.cv_estimators, splits):
+ for estimator, cat_encoder, split in zip(self.cv_estimators, self.cv_cat_encoders, splits):
  _, validation_idx = split
  cv_x = x.iloc[validation_idx]
  if isinstance(y, pd.Series):
  cv_y = y.iloc[validation_idx]
  else:
  cv_y = y[validation_idx]
- shaps = self.calculate_shap(cv_x, cv_y, estimator)
+ shaps = self.calculate_shap(cv_x, cv_y, estimator, cat_encoder)
  if shaps is not None:
  for feature, shap_value in shaps.items():
  shap_values_all_folds[feature].append(shap_value)
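
Note on the cross_val_predict change above: cv_results["cat_encoder"] implies a package-internal cross-validation helper that returns one fitted categorical encoder per fold alongside each fitted estimator (scikit-learn's cross_validate accepts no random_state argument and exposes no "cat_encoder" key). A minimal sketch of that per-fold pattern, with hypothetical names, might look like:

# Minimal sketch (not upgini's implementation) of collecting a fitted estimator and a
# fitted categorical encoder per CV fold, mirroring the cv_estimators/cv_cat_encoders pair.
from copy import deepcopy

def manual_cross_val(estimator, encoder, cv, x, y, cat_features):
    fold_estimators, fold_encoders = [], []
    for train_idx, _ in cv.split(x, y):
        x_train, y_train = x.iloc[train_idx].copy(), y.iloc[train_idx]
        enc = deepcopy(encoder)
        # Fit the encoder only on this fold's training part to avoid target leakage.
        x_train[cat_features] = enc.fit_transform(x_train[cat_features], y_train)
        fold_estimators.append(deepcopy(estimator).fit(x_train, y_train))
        fold_encoders.append(enc)
    return fold_estimators, fold_encoders
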
@@ -527,8 +547,19 @@ class EstimatorWrapper:
  metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
  else:
  metrics = []
- for est in self.cv_estimators:
- metrics.append(self.scorer(est, x, y))
+ for est, cat_encoder in zip(self.cv_estimators, self.cv_cat_encoders):
+ x_copy = x.copy()
+ if cat_encoder is not None:
+ if hasattr(cat_encoder, "feature_names_in_"):
+ encoded = cat_encoder.transform(x_copy[cat_encoder.feature_names_in_])
+ else:
+ encoded = cat_encoder.transform(x[self.cat_features])
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+ encoded = encoded.astype(int)
+ else:
+ encoded = encoded.astype("category")
+ x_copy[self.cat_features] = encoded
+ metrics.append(self.scorer(est, x_copy, y))

  metric, metric_std = self._calculate_metric_from_folds(metrics)
  return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
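
The scoring loop above re-applies each fold's fitted encoder to a copy of the data before calling the scorer, falling back to self.cat_features when the encoder exposes no feature_names_in_, and keeping integer codes only for time-series splits. Isolated as a hypothetical helper, the same logic reads:

# Illustrative helper (hypothetical name) matching the per-fold scoring logic above.
def encode_for_scoring(x, cat_encoder, cat_features, time_series_cv: bool):
    x_copy = x.copy()
    if cat_encoder is not None:
        cols = getattr(cat_encoder, "feature_names_in_", cat_features)
        encoded = cat_encoder.transform(x_copy[list(cols)])
        # Time-series CV keeps ordinal integer codes; otherwise use pandas categories.
        x_copy[cat_features] = encoded.astype(int) if time_series_cv else encoded.astype("category")
    return x_copy
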
@@ -551,7 +582,7 @@ class EstimatorWrapper:
  text_features: Optional[List[str]] = None,
  add_params: Optional[Dict[str, Any]] = None,
  groups: Optional[List[str]] = None,
- has_date: Optional[bool] = None,
+ has_time: bool = False,
  ) -> EstimatorWrapper:
  scorer, metric_name, multiplier = define_scorer(target_type, scoring)
  kwargs = {
@@ -568,7 +599,7 @@ class EstimatorWrapper:
  if estimator is None:
  if EstimatorWrapper.default_estimator == "catboost":
  logger.info("Using CatBoost as default estimator")
- params = {"has_time": has_date}
+ params = {"has_time": has_time}
  if target_type == ModelTaskType.MULTICLASS:
  params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
  params = _get_add_params(params, add_params)
@@ -578,7 +609,10 @@ class EstimatorWrapper:
  params = _get_add_params(params, add_params)
  estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
  elif target_type == ModelTaskType.REGRESSION:
- params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+ if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+ params = _get_add_params(params, CATBOOST_TS_PARAMS)
+ else:
+ params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
  params = _get_add_params(params, add_params)
  estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
  else:
@@ -610,8 +644,8 @@ class EstimatorWrapper:
  estimator_copy = deepcopy(estimator)
  kwargs["estimator"] = estimator_copy
  if is_catboost_estimator(estimator):
- if has_date is not None:
- estimator_copy.set_params(has_time=has_date)
+ if has_time is not None:
+ estimator_copy.set_params(has_time=has_time)
  estimator = CatBoostWrapper(**kwargs)
  else:
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -769,15 +803,26 @@ class CatBoostWrapper(EstimatorWrapper):
  else:
  raise e

- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder) -> Optional[Dict[str, float]]:
  try:
  from catboost import Pool

+ cat_features = None
+ if cat_encoder is not None:
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+ encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
+ else:
+ encoded = cat_encoder.transform(x[self.cat_features])
+ cat_features = encoded.columns.to_list()
+ x[self.cat_features] = encoded
+ else:
+ cat_features = self.cat_features
+
  # Create Pool for fold data, if need (for example, when categorical features are present)
  fold_pool = Pool(
  x,
  y,
- cat_features=self.cat_features,
+ cat_features=cat_features,
  text_features=self.text_features,
  embedding_features=self.grouped_embedding_features,
  )
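
For CatBoost, per-fold SHAP values are computed from a Pool that carries the categorical, text and embedding feature lists. A hedged sketch of the general pattern (the Pool construction mirrors the diff; the aggregation around it is illustrative and assumes a binary or regression model):

# Hedged sketch of per-fold SHAP with CatBoost: build a Pool, then average absolute
# SHAP contributions per feature. get_feature_importance(type="ShapValues") is the
# standard CatBoost API; the surrounding wiring is illustrative.
import numpy as np
from catboost import Pool

def catboost_mean_abs_shap(estimator, x, y, cat_features):
    pool = Pool(x, y, cat_features=cat_features)
    shap = estimator.get_feature_importance(pool, type="ShapValues")
    shap = shap[:, :-1]  # last column is the expected-value (bias) term
    return dict(zip(x.columns, np.abs(shap).mean(axis=0)))
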
@@ -834,7 +879,6 @@ class LightGBMWrapper(EstimatorWrapper):
  text_features=text_features,
  logger=logger,
  )
- self.cat_encoder = None
  self.n_classes = None

  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -846,10 +890,10 @@ class LightGBMWrapper(EstimatorWrapper):
  params["eval_metric"] = "auc"
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
  if self.cat_features:
- encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
- encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
- x[self.cat_features] = encoded
- self.cat_encoder = encoder
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
+
  for c in x.columns:
  if x[c].dtype not in ["category", "int64", "float64", "bool"]:
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -859,15 +903,26 @@ class LightGBMWrapper(EstimatorWrapper):

  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
- if self.cat_features is not None and self.cat_encoder is not None:
- encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
- x[self.cat_features] = encoded
+ if self.cat_features:
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
  return x, y_numpy, params

- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+ def calculate_shap(
+ self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
+ ) -> Optional[Dict[str, float]]:
  try:
+ x_copy = x.copy()
+ if cat_encoder is not None:
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+ encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
+ else:
+ encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
+ x_copy[self.cat_features] = encoded
+
  shap_matrix = estimator.predict(
- x,
+ x_copy,
  predict_disable_shape_check=True,
  raw_score=True,
  pred_leaf=False,
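
For LightGBM, the wrapper obtains SHAP-style attributions through the estimator's predict call on the encoded copy of the fold data. A minimal, hedged sketch of the underlying pred_contrib mechanism (names and aggregation are illustrative):

# Hedged sketch of per-feature attributions with LightGBM's pred_contrib: the last
# column of the returned matrix is the expected value, the rest are per-feature
# contributions. Assumes a binary or regression model (single output block).
import numpy as np

def lightgbm_mean_abs_contrib(estimator, x):
    contrib = np.asarray(estimator.predict(x, pred_contrib=True))[:, :-1]
    return dict(zip(x.columns, np.abs(contrib).mean(axis=0)))
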
@@ -926,10 +981,10 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  num_features = [col for col in x.columns if col not in self.cat_features]
  x[num_features] = x[num_features].fillna(-999)
  if self.cat_features:
- encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
- encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
- x[self.cat_features] = encoded
- self.cat_encoder = encoder
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
+ params["cat_features"] = self.cat_features
  for c in x.columns:
  if x[c].dtype not in ["category", "int64", "float64", "bool"]:
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -940,15 +995,22 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
  if self.cat_features is not None:
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
  num_features = [col for col in x.columns if col not in self.cat_features]
- x[num_features] = x[num_features].fillna(-999)
- if self.cat_features and self.cat_encoder is not None:
- x[self.cat_features] = self.cat_encoder.transform(
- x[self.cat_features].astype("object"), y_numpy
- ).astype("category")
+ else:
+ num_features = x.columns
+ x[num_features] = x[num_features].fillna(-999)
+
  return x, y_numpy, params


+ @runtime_checkable
+ class HasTransform(Protocol):
+ def transform(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> pd.DataFrame: ...
+
+
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
  if scoring is None:
  return
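
The HasTransform protocol added above gives the encoder annotations a structural type: any object with a transform method satisfies it, and runtime_checkable makes isinstance checks possible (they verify only that the attribute exists, not its signature). A small self-contained sketch with hypothetical names:

# Minimal sketch of how a runtime_checkable Protocol like HasTransform behaves:
# isinstance() only checks that a .transform attribute exists, not its signature.
from typing import Protocol, runtime_checkable

@runtime_checkable
class SupportsTransform(Protocol):
    def transform(self, X): ...

class DummyEncoder:
    def transform(self, X):
        return X

print(isinstance(DummyEncoder(), SupportsTransform))  # True, purely structural
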
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
  invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
  no_important_features_for_transform=There are no important features for transform. Return input as transformed
  search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
+ binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
+ binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set

  # Validation errors
  # params validation
@@ -156,7 +158,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
  dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
  dataset_empty_column_names=Some column names are empty. Add names please
  dataset_full_duplicates={:.5f}% of the rows are fully duplicated
- dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
+ dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
  dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
  dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
  dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -41,6 +41,7 @@ class DateTimeSearchKeyConverter:
  date_format: Optional[str] = None,
  logger: Optional[logging.Logger] = None,
  bundle: Optional[ResourceBundle] = None,
+ generate_cyclical_features: bool = True,
  ):
  self.date_column = date_column
  self.date_format = date_format
@@ -51,6 +52,7 @@ class DateTimeSearchKeyConverter:
  self.logger.setLevel("FATAL")
  self.generated_features: List[str] = []
  self.bundle = bundle or get_custom_bundle()
+ self.generate_cyclical_features = generate_cyclical_features
  self.has_old_dates = False

  @staticmethod
@@ -121,61 +123,63 @@ class DateTimeSearchKeyConverter:
  df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
  self.generated_features.append(cos_feature)

- # df["quarter"] = df[self.date_column].dt.quarter
+ if self.generate_cyclical_features:

- # # Calculate the start date of the quarter for each timestamp
- # df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
+ df["quarter"] = df[self.date_column].dt.quarter

- # # Calculate the day in the quarter
- # df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
+ # Calculate the start date of the quarter for each timestamp
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time

- # # Vectorized calculation of days_in_quarter
- # quarter = df["quarter"]
- # start = df["quarter_start"]
- # year = start.dt.year
- # month = start.dt.month
+ # Calculate the day in the quarter
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1

- # quarter_end_year = np.where(quarter == 4, year + 1, year)
- # quarter_end_month = np.where(quarter == 4, 1, month + 3)
+ # Vectorized calculation of days_in_quarter
+ quarter = df["quarter"]
+ start = df["quarter_start"]
+ year = start.dt.year
+ month = start.dt.month

- # end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
- # end.index = df.index
+ quarter_end_year = np.where(quarter == 4, year + 1, year)
+ quarter_end_month = np.where(quarter == 4, 1, month + 3)

- # df["days_in_quarter"] = (end - start).dt.days
+ end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
+ end.index = df.index

- # add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
+ df["days_in_quarter"] = (end - start).dt.days

- # df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
+ add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter

- df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
+ df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)

- seconds_without_na = df[seconds].dropna()
- if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
- self.logger.info("Time found in date search key. Add extra features based on time")
+ df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds

- # Extract basic components
- df["second"] = df[self.date_column].dt.second
- df["minute"] = df[self.date_column].dt.minute
- df["hour"] = df[self.date_column].dt.hour
+ seconds_without_na = df[seconds].dropna()
+ if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
+ self.logger.info("Time found in date search key. Add extra features based on time")

- # Apply cyclical transformations
- add_cyclical_features(df, "second", 60) # Seconds in a minute
- add_cyclical_features(df, "minute", 60) # Minutes in an hour
- add_cyclical_features(df, "minute", 30) # Minutes in half an hour
- add_cyclical_features(df, "hour", 24) # Hours in a day
+ # Extract basic components
+ df["second"] = df[self.date_column].dt.second
+ df["minute"] = df[self.date_column].dt.minute
+ df["hour"] = df[self.date_column].dt.hour

- # Drop intermediate columns if not needed
- df.drop(columns=["second", "minute", "hour"], inplace=True)
- else:
- keep_time = False
+ # Apply cyclical transformations
+ add_cyclical_features(df, "second", 60) # Seconds in a minute
+ add_cyclical_features(df, "minute", 60) # Minutes in an hour
+ add_cyclical_features(df, "minute", 30) # Minutes in half an hour
+ add_cyclical_features(df, "hour", 24) # Hours in a day
+
+ # Drop intermediate columns if not needed
+ df.drop(columns=["second", "minute", "hour"], inplace=True)
+ else:
+ keep_time = False

- for generated_feature in self.generated_features[:]:
- if df[generated_feature].dropna().nunique() <= 1:
- self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
- df.drop(columns=generated_feature, inplace=True)
- self.generated_features.remove(generated_feature)
+ for generated_feature in self.generated_features[:]:
+ if df[generated_feature].dropna().nunique() <= 1:
+ self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
+ df.drop(columns=generated_feature, inplace=True)
+ self.generated_features.remove(generated_feature)

- df.drop(columns=seconds, inplace=True)
+ df.drop(columns=seconds, inplace=True)

  if keep_time:
  df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
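
The block above un-comments the quarter-based features and gates all cyclical date features behind generate_cyclical_features. The encoding itself is the usual sin/cos pair over a known period, as in the np.cos(2 * np.pi * df[column] / period) line at the top of the hunk. A standalone sketch (helper and column names are illustrative):

# Standalone sketch of sin/cos cyclical encoding, matching the pattern visible in the diff.
import numpy as np
import pandas as pd

def add_cyclical_pair(df: pd.DataFrame, column: str, period) -> None:
    df[f"{column}_sin"] = np.sin(2 * np.pi * df[column] / period)
    df[f"{column}_cos"] = np.cos(2 * np.pi * df[column] / period)

df = pd.DataFrame({"hour": [0, 6, 12, 18, 23]})
add_cyclical_pair(df, "hour", 24)  # 23:00 ends up close to 00:00 in (sin, cos) space
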
@@ -247,99 +251,107 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:


  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
- df = df.copy()
- seconds = "datetime_seconds"
- if isinstance(df[date_col].dtype, pd.PeriodDtype):
- df[date_col] = df[date_col].dt.to_timestamp()
- else:
- df[date_col] = pd.to_datetime(df[date_col])
- df[date_col] = df[date_col].dt.tz_localize(None)
- df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
-
- seconds_without_na = df[seconds].dropna()
- columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
- df.drop(columns=columns_to_drop, inplace=True)
- # Date, not datetime
- if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
- return False
+ try:
+ df = df.copy()
+ seconds = "datetime_seconds"
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
+ df[date_col] = df[date_col].dt.to_timestamp()
+ elif is_numeric_dtype(df[date_col]):
+ df[date_col] = pd.to_datetime(df[date_col], unit="ms")
+ else:
+ df[date_col] = pd.to_datetime(df[date_col])
+ df[date_col] = df[date_col].dt.tz_localize(None)
+ df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds

- nunique_dates = df[date_col].nunique()
- # Unique dates count more than 270
- if nunique_dates < 270:
- return False
+ seconds_without_na = df[seconds].dropna()
+ columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
+ df.drop(columns=columns_to_drop, inplace=True)
+ # Date, not datetime
+ if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
+ return False

- min_date = df[date_col].min()
- max_date = df[date_col].max()
- days_delta = (max_date - min_date).days + 1
- # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
- if nunique_dates / days_delta < 0.3:
- return False
+ nunique_dates = df[date_col].nunique()
+ # Unique dates count more than 270
+ if nunique_dates < 270:
+ return False
+
+ min_date = df[date_col].min()
+ max_date = df[date_col].max()
+ days_delta = (max_date - min_date).days + 1
+ # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
+ if nunique_dates / days_delta < 0.3:
+ return False

- accumulated_changing_columns = set()
+ accumulated_changing_columns = set()

- def check_differences(group: pd.DataFrame):
- changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
- accumulated_changing_columns.update(changing_columns)
+ def check_differences(group: pd.DataFrame):
+ changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
+ accumulated_changing_columns.update(changing_columns)

- def is_multiple_rows(group: pd.DataFrame) -> bool:
- return group.shape[0] > 1
+ def is_multiple_rows(group: pd.DataFrame) -> bool:
+ return group.shape[0] > 1

- grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
- dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()
+ grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
+ dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()

- # share of dates with more than one record is more than 99%
- if dates_with_multiple_rows / nunique_dates < 0.99:
- return False
+ # share of dates with more than one record is more than 99%
+ if dates_with_multiple_rows / nunique_dates < 0.99:
+ return False

- if df.shape[1] <= 3:
- return True
+ if df.shape[1] <= 3:
+ return True

- grouped.apply(check_differences)
- return len(accumulated_changing_columns) <= 2
+ grouped.apply(check_differences)
+ return len(accumulated_changing_columns) <= 2
+ except Exception:
+ return False


  def is_dates_distribution_valid(
  df: pd.DataFrame,
  search_keys: Dict[str, SearchKey],
  ) -> bool:
- maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+ try:
+ maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])

- if EVAL_SET_INDEX in df.columns:
- X = df.query(f"{EVAL_SET_INDEX} == 0")
- else:
- X = df
+ if EVAL_SET_INDEX in df.columns:
+ X = df.query(f"{EVAL_SET_INDEX} == 0")
+ else:
+ X = df

- if maybe_date_col is None:
- for col in X.columns:
- if col in search_keys:
- continue
- try:
- if isinstance(X[col].dtype, pd.PeriodDtype):
+ if maybe_date_col is None:
+ for col in X.columns:
+ if col in search_keys:
+ continue
+ try:
+ if isinstance(X[col].dtype, pd.PeriodDtype):
+ pass
+ elif pd.__version__ >= "2.0.0":
+ # Format mixed to avoid massive warnings
+ pd.to_datetime(X[col], format="mixed")
+ else:
+ pd.to_datetime(X[col])
+ maybe_date_col = col
+ break
+ except Exception:
  pass
- elif pd.__version__ >= "2.0.0":
- # Format mixed to avoid massive warnings
- pd.to_datetime(X[col], format="mixed")
- else:
- pd.to_datetime(X[col])
- maybe_date_col = col
- break
- except Exception:
- pass
-
- if maybe_date_col is None:
- return
-
- if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
- dates = X[maybe_date_col].dt.to_timestamp().dt.date
- elif pd.__version__ >= "2.0.0":
- dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
- else:
- dates = pd.to_datetime(X[maybe_date_col]).dt.date
-
- date_counts = dates.value_counts().sort_index()
-
- date_counts_1 = date_counts[: round(len(date_counts) / 2)]
- date_counts_2 = date_counts[round(len(date_counts) / 2) :]
- ratio = date_counts_2.mean() / date_counts_1.mean()
-
- return ratio >= 0.8 and ratio <= 1.2
+
+ if maybe_date_col is None:
+ return
+
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
+ dates = X[maybe_date_col].dt.to_timestamp().dt.date
+ elif pd.__version__ >= "2.0.0":
+ dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
+ else:
+ dates = pd.to_datetime(X[maybe_date_col]).dt.date
+
+ date_counts = dates.value_counts().sort_index()
+
+ date_counts_1 = date_counts[: round(len(date_counts) / 2)]
+ date_counts_2 = date_counts[round(len(date_counts) / 2) :]
+ ratio = date_counts_2.mean() / date_counts_1.mean()
+
+ return ratio >= 0.8 and ratio <= 1.2
+ except Exception:
+ return False
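
is_dates_distribution_valid (now wrapped in a broad try/except that returns False on any failure) accepts a dataset when the mean daily row count of the second half of the date range stays within 20% of the first half's mean. Condensed into a hypothetical standalone helper:

# Condensed sketch of the date-distribution check above; the helper name is illustrative.
import pandas as pd

def dates_evenly_distributed(dates: pd.Series) -> bool:
    counts = pd.to_datetime(dates).dt.date.value_counts().sort_index()
    half = round(len(counts) / 2)
    ratio = counts.iloc[half:].mean() / counts.iloc[:half].mean()
    return 0.8 <= ratio <= 1.2
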
@@ -104,9 +104,9 @@ def remove_fintech_duplicates(
  sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)

  # Convert date columns for further checks
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
- sub_df
- )
+ sub_df = DateTimeSearchKeyConverter(
+ date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
+ ).convert(sub_df)
  grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)

@@ -192,7 +192,7 @@ def clean_full_duplicates(
  unique_columns.remove(TARGET)
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
  if marked_duplicates.sum() > 0:
- dups_indices = df[marked_duplicates].index.to_list()
+ dups_indices = df[marked_duplicates].index.to_list()[:100]
  nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup