upgini 1.2.86.dev1__py3-none-any.whl → 1.2.87.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.86.dev1"
1
+ __version__ = "1.2.87.dev3"
@@ -30,7 +30,7 @@ from pandas.api.types import (
30
30
  from scipy.stats import ks_2samp
31
31
  from sklearn.base import TransformerMixin
32
32
  from sklearn.exceptions import NotFittedError
33
- from sklearn.model_selection import BaseCrossValidator
33
+ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
34
34
 
35
35
  from upgini.autofe.feature import Feature
36
36
  from upgini.autofe.timeseries import TimeSeriesBase
@@ -71,6 +71,7 @@ from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
71
71
  from upgini.search_task import SearchTask
72
72
  from upgini.spinner import Spinner
73
73
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
74
+ from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
74
75
  from upgini.utils.country_utils import (
75
76
  CountrySearchKeyConverter,
76
77
  CountrySearchKeyDetector,
@@ -114,7 +115,9 @@ from upgini.utils.postal_code_utils import (
114
115
  try:
115
116
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
116
117
  except Exception:
117
- from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
118
+ from upgini.utils.fallback_progress_bar import (
119
+ CustomFallbackProgressBar as ProgressBar,
120
+ )
118
121
 
119
122
  from upgini.utils.sort import sort_columns
120
123
  from upgini.utils.target_utils import (
@@ -239,6 +242,7 @@ class FeaturesEnricher(TransformerMixin):
239
242
  add_date_if_missing: bool = True,
240
243
  disable_force_downsampling: bool = False,
241
244
  id_columns: Optional[List[str]] = None,
245
+ generate_search_key_features: bool = True,
242
246
  **kwargs,
243
247
  ):
244
248
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -365,6 +369,8 @@ class FeaturesEnricher(TransformerMixin):
365
369
  self.exclude_columns = exclude_columns
366
370
  self.baseline_score_column = baseline_score_column
367
371
  self.add_date_if_missing = add_date_if_missing
372
+ self.generate_search_key_features = generate_search_key_features
373
+
368
374
  self.features_info_display_handle = None
369
375
  self.data_sources_display_handle = None
370
376
  self.autofe_features_display_handle = None
@@ -1045,6 +1051,7 @@ class FeaturesEnricher(TransformerMixin):
1045
1051
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
1046
1052
 
1047
1053
  has_date = self._get_date_column(search_keys) is not None
1054
+ has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
1048
1055
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
1049
1056
  cat_features = list(set(client_cat_features + cat_features_from_backend))
1050
1057
  baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
@@ -1077,7 +1084,7 @@ class FeaturesEnricher(TransformerMixin):
1077
1084
  add_params=custom_loss_add_params,
1078
1085
  groups=groups,
1079
1086
  text_features=text_features,
1080
- has_date=has_date,
1087
+ has_time=has_time,
1081
1088
  )
1082
1089
  baseline_cv_result = baseline_estimator.cross_val_predict(
1083
1090
  fitting_X, y_sorted, baseline_score_column
@@ -1112,7 +1119,7 @@ class FeaturesEnricher(TransformerMixin):
1112
1119
  add_params=custom_loss_add_params,
1113
1120
  groups=groups,
1114
1121
  text_features=text_features,
1115
- has_date=has_date,
1122
+ has_time=has_time,
1116
1123
  )
1117
1124
  enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1118
1125
  enriched_metric = enriched_cv_result.get_display_metric()
@@ -1773,7 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
1773
1780
  date_column = self._get_date_column(search_keys)
1774
1781
  generated_features = []
1775
1782
  if date_column is not None:
1776
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1783
+ converter = DateTimeSearchKeyConverter(
1784
+ date_column,
1785
+ self.date_format,
1786
+ self.logger,
1787
+ self.bundle,
1788
+ generate_cyclical_features=self.generate_search_key_features,
1789
+ )
1777
1790
  # Leave original date column values
1778
1791
  df_with_date_features = converter.convert(df, keep_time=True)
1779
1792
  df_with_date_features[date_column] = df[date_column]
@@ -1781,7 +1794,7 @@ class FeaturesEnricher(TransformerMixin):
1781
1794
  generated_features = converter.generated_features
1782
1795
 
1783
1796
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
1784
- if email_columns:
1797
+ if email_columns and self.generate_search_key_features:
1785
1798
  generator = EmailDomainGenerator(email_columns)
1786
1799
  df = generator.generate(df)
1787
1800
  generated_features.extend(generator.generated_features)
@@ -2204,10 +2217,12 @@ class FeaturesEnricher(TransformerMixin):
2204
2217
  {"name": name, "value": key_example(sk_type)} for name in sk_meta.unnestKeyNames
2205
2218
  ]
2206
2219
  else:
2207
- search_keys_with_values[sk_type.name] = [{
2208
- "name": sk_meta.originalName,
2209
- "value": key_example(sk_type),
2210
- }]
2220
+ search_keys_with_values[sk_type.name] = [
2221
+ {
2222
+ "name": sk_meta.originalName,
2223
+ "value": key_example(sk_type),
2224
+ }
2225
+ ]
2211
2226
 
2212
2227
  keys_section = json.dumps(search_keys_with_values)
2213
2228
  features_for_transform = self._search_task.get_features_for_transform()
@@ -2360,7 +2375,13 @@ if response.status_code == 200:
2360
2375
  generated_features = []
2361
2376
  date_column = self._get_date_column(search_keys)
2362
2377
  if date_column is not None:
2363
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2378
+ converter = DateTimeSearchKeyConverter(
2379
+ date_column,
2380
+ self.date_format,
2381
+ self.logger,
2382
+ bundle=self.bundle,
2383
+ generate_cyclical_features=self.generate_search_key_features,
2384
+ )
2364
2385
  df = converter.convert(df, keep_time=True)
2365
2386
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2366
2387
  generated_features.extend(converter.generated_features)
@@ -2370,7 +2391,7 @@ if response.status_code == 200:
2370
2391
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2371
2392
 
2372
2393
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2373
- if email_columns:
2394
+ if email_columns and self.generate_search_key_features:
2374
2395
  generator = EmailDomainGenerator(email_columns)
2375
2396
  df = generator.generate(df)
2376
2397
  generated_features.extend(generator.generated_features)
@@ -2860,6 +2881,7 @@ if response.status_code == 200:
2860
2881
  self.date_format,
2861
2882
  self.logger,
2862
2883
  bundle=self.bundle,
2884
+ generate_cyclical_features=self.generate_search_key_features,
2863
2885
  )
2864
2886
  df = converter.convert(df, keep_time=True)
2865
2887
  if converter.has_old_dates:
@@ -2872,7 +2894,7 @@ if response.status_code == 200:
2872
2894
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2873
2895
 
2874
2896
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2875
- if email_columns:
2897
+ if email_columns and self.generate_search_key_features:
2876
2898
  generator = EmailDomainGenerator(email_columns)
2877
2899
  df = generator.generate(df)
2878
2900
  self.fit_generated_features.extend(generator.generated_features)
@@ -3564,7 +3586,9 @@ if response.status_code == 200:
3564
3586
  maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3565
3587
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3566
3588
  # TODO cast date column to single dtype
3567
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3589
+ date_converter = DateTimeSearchKeyConverter(
3590
+ maybe_date_col, self.date_format, generate_cyclical_features=False
3591
+ )
3568
3592
  converted_X = date_converter.convert(X)
3569
3593
  min_date = converted_X[maybe_date_col].min()
3570
3594
  max_date = converted_X[maybe_date_col].max()
@@ -3603,7 +3627,7 @@ if response.status_code == 200:
3603
3627
  self.__log_warning(bundle.get("current_date_added"))
3604
3628
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3605
3629
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3606
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
3630
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
3607
3631
  df = converter.convert(df)
3608
3632
  return df
3609
3633
 
upgini/metrics.py CHANGED
@@ -6,16 +6,26 @@ import re
6
6
  from collections import defaultdict
7
7
  from copy import deepcopy
8
8
  from dataclasses import dataclass
9
- from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
9
+ from typing import (
10
+ Any,
11
+ Callable,
12
+ Dict,
13
+ List,
14
+ Literal,
15
+ Optional,
16
+ Protocol,
17
+ Tuple,
18
+ Union,
19
+ runtime_checkable,
20
+ )
10
21
 
11
22
  import lightgbm as lgb
12
23
  import numpy as np
13
24
  import pandas as pd
14
25
  from catboost import CatBoostClassifier, CatBoostRegressor
15
- from category_encoders.cat_boost import CatBoostEncoder
16
26
  from lightgbm import LGBMClassifier, LGBMRegressor
17
27
  from numpy import log1p
18
- from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
28
+ from pandas.api.types import is_float_dtype, is_integer_dtype, is_numeric_dtype
19
29
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
20
30
 
21
31
  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -32,10 +42,7 @@ except ImportError:
32
42
  available_scorers = SCORERS
33
43
  from sklearn.metrics import mean_squared_error
34
44
  from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
35
- from sklearn.model_selection import ( # , TimeSeriesSplit
36
- BaseCrossValidator,
37
- TimeSeriesSplit,
38
- )
45
+ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
39
46
 
40
47
  from upgini.errors import ValidationError
41
48
  from upgini.metadata import ModelTaskType
@@ -57,6 +64,16 @@ CATBOOST_REGRESSION_PARAMS = {
57
64
  "allow_writing_files": False,
58
65
  }
59
66
 
67
+ CATBOOST_TS_PARAMS = {
68
+ "learning_rate": 0.05,
69
+ "early_stopping_rounds": 20,
70
+ "use_best_model": True,
71
+ "one_hot_max_size": 100,
72
+ "verbose": False,
73
+ "random_state": 42,
74
+ "allow_writing_files": False,
75
+ }
76
+
60
77
  CATBOOST_BINARY_PARAMS = {
61
78
  "iterations": 250,
62
79
  "learning_rate": 0.05,
@@ -311,6 +328,7 @@ class EstimatorWrapper:
311
328
  self.target_type = target_type
312
329
  self.add_params = add_params
313
330
  self.cv_estimators = None
331
+ self.cv_cat_encoders: Optional[List[Optional[HasTransform]]] = None
314
332
  self.groups = groups
315
333
  self.text_features = text_features
316
334
  self.logger = logger or logging.getLogger()
@@ -391,9 +409,7 @@ class EstimatorWrapper:
391
409
  self.converted_to_int.append(c)
392
410
  self.cat_features.remove(c)
393
411
  elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
394
- self.logger.info(
395
- f"Convert float cat feature {c} to string"
396
- )
412
+ self.logger.info(f"Convert float cat feature {c} to string")
397
413
  x[c] = x[c].astype(str)
398
414
  self.converted_to_str.append(c)
399
415
  elif x[c].dtype not in ["category", "int64"]:
@@ -439,7 +455,9 @@ class EstimatorWrapper:
439
455
 
440
456
  return x, y, {}
441
457
 
442
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
458
+ def calculate_shap(
459
+ self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
460
+ ) -> Optional[Dict[str, float]]:
443
461
  return None
444
462
 
445
463
  def cross_val_predict(
@@ -470,9 +488,11 @@ class EstimatorWrapper:
470
488
  fit_params=fit_params,
471
489
  return_estimator=True,
472
490
  error_score="raise",
491
+ random_state=DEFAULT_RANDOM_STATE,
473
492
  )
474
493
  metrics_by_fold = cv_results["test_score"]
475
494
  self.cv_estimators = cv_results["estimator"]
495
+ self.cv_cat_encoders = cv_results["cat_encoder"]
476
496
 
477
497
  self.check_fold_metrics(metrics_by_fold)
478
498
 
@@ -480,14 +500,14 @@ class EstimatorWrapper:
480
500
 
481
501
  splits = self.cv.split(x, y, groups)
482
502
 
483
- for estimator, split in zip(self.cv_estimators, splits):
503
+ for estimator, cat_encoder, split in zip(self.cv_estimators, self.cv_cat_encoders, splits):
484
504
  _, validation_idx = split
485
505
  cv_x = x.iloc[validation_idx]
486
506
  if isinstance(y, pd.Series):
487
507
  cv_y = y.iloc[validation_idx]
488
508
  else:
489
509
  cv_y = y[validation_idx]
490
- shaps = self.calculate_shap(cv_x, cv_y, estimator)
510
+ shaps = self.calculate_shap(cv_x, cv_y, estimator, cat_encoder)
491
511
  if shaps is not None:
492
512
  for feature, shap_value in shaps.items():
493
513
  shap_values_all_folds[feature].append(shap_value)
@@ -527,8 +547,19 @@ class EstimatorWrapper:
527
547
  metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
528
548
  else:
529
549
  metrics = []
530
- for est in self.cv_estimators:
531
- metrics.append(self.scorer(est, x, y))
550
+ for est, cat_encoder in zip(self.cv_estimators, self.cv_cat_encoders):
551
+ x_copy = x.copy()
552
+ if cat_encoder is not None:
553
+ if hasattr(cat_encoder, "feature_names_in_"):
554
+ encoded = cat_encoder.transform(x_copy[cat_encoder.feature_names_in_])
555
+ else:
556
+ encoded = cat_encoder.transform(x[self.cat_features])
557
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
558
+ encoded = encoded.astype(int)
559
+ else:
560
+ encoded = encoded.astype("category")
561
+ x_copy[self.cat_features] = encoded
562
+ metrics.append(self.scorer(est, x_copy, y))
532
563
 
533
564
  metric, metric_std = self._calculate_metric_from_folds(metrics)
534
565
  return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
@@ -551,7 +582,7 @@ class EstimatorWrapper:
551
582
  text_features: Optional[List[str]] = None,
552
583
  add_params: Optional[Dict[str, Any]] = None,
553
584
  groups: Optional[List[str]] = None,
554
- has_date: Optional[bool] = None,
585
+ has_time: bool = False,
555
586
  ) -> EstimatorWrapper:
556
587
  scorer, metric_name, multiplier = define_scorer(target_type, scoring)
557
588
  kwargs = {
@@ -568,7 +599,7 @@ class EstimatorWrapper:
568
599
  if estimator is None:
569
600
  if EstimatorWrapper.default_estimator == "catboost":
570
601
  logger.info("Using CatBoost as default estimator")
571
- params = {"has_time": has_date}
602
+ params = {"has_time": has_time}
572
603
  if target_type == ModelTaskType.MULTICLASS:
573
604
  params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
574
605
  params = _get_add_params(params, add_params)
@@ -578,7 +609,10 @@ class EstimatorWrapper:
578
609
  params = _get_add_params(params, add_params)
579
610
  estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
580
611
  elif target_type == ModelTaskType.REGRESSION:
581
- params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
612
+ if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
613
+ params = _get_add_params(params, CATBOOST_TS_PARAMS)
614
+ else:
615
+ params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
582
616
  params = _get_add_params(params, add_params)
583
617
  estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
584
618
  else:
@@ -610,8 +644,8 @@ class EstimatorWrapper:
610
644
  estimator_copy = deepcopy(estimator)
611
645
  kwargs["estimator"] = estimator_copy
612
646
  if is_catboost_estimator(estimator):
613
- if has_date is not None:
614
- estimator_copy.set_params(has_time=has_date)
647
+ if has_time is not None:
648
+ estimator_copy.set_params(has_time=has_time)
615
649
  estimator = CatBoostWrapper(**kwargs)
616
650
  else:
617
651
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -769,15 +803,24 @@ class CatBoostWrapper(EstimatorWrapper):
769
803
  else:
770
804
  raise e
771
805
 
772
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
806
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder) -> Optional[Dict[str, float]]:
773
807
  try:
774
808
  from catboost import Pool
775
809
 
810
+ if cat_encoder is not None:
811
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
812
+ encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
813
+ cat_features = None
814
+ else:
815
+ encoded = cat_encoder.transform(x[self.cat_features])
816
+ cat_features = encoded.columns.to_list()
817
+ x[self.cat_features] = encoded
818
+
776
819
  # Create Pool for fold data, if need (for example, when categorical features are present)
777
820
  fold_pool = Pool(
778
821
  x,
779
822
  y,
780
- cat_features=self.cat_features,
823
+ cat_features=cat_features,
781
824
  text_features=self.text_features,
782
825
  embedding_features=self.grouped_embedding_features,
783
826
  )
@@ -834,7 +877,6 @@ class LightGBMWrapper(EstimatorWrapper):
834
877
  text_features=text_features,
835
878
  logger=logger,
836
879
  )
837
- self.cat_encoder = None
838
880
  self.n_classes = None
839
881
 
840
882
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -846,10 +888,10 @@ class LightGBMWrapper(EstimatorWrapper):
846
888
  params["eval_metric"] = "auc"
847
889
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
848
890
  if self.cat_features:
849
- encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
850
- encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
851
- x[self.cat_features] = encoded
852
- self.cat_encoder = encoder
891
+ for c in self.cat_features:
892
+ if x[c].dtype != "category":
893
+ x[c] = x[c].astype("category")
894
+
853
895
  for c in x.columns:
854
896
  if x[c].dtype not in ["category", "int64", "float64", "bool"]:
855
897
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -859,15 +901,26 @@ class LightGBMWrapper(EstimatorWrapper):
859
901
 
860
902
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
861
903
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
862
- if self.cat_features is not None and self.cat_encoder is not None:
863
- encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
864
- x[self.cat_features] = encoded
904
+ if self.cat_features:
905
+ for c in self.cat_features:
906
+ if x[c].dtype != "category":
907
+ x[c] = x[c].astype("category")
865
908
  return x, y_numpy, params
866
909
 
867
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
910
+ def calculate_shap(
911
+ self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
912
+ ) -> Optional[Dict[str, float]]:
868
913
  try:
914
+ x_copy = x.copy()
915
+ if cat_encoder is not None:
916
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
917
+ encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
918
+ else:
919
+ encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
920
+ x_copy[self.cat_features] = encoded
921
+
869
922
  shap_matrix = estimator.predict(
870
- x,
923
+ x_copy,
871
924
  predict_disable_shape_check=True,
872
925
  raw_score=True,
873
926
  pred_leaf=False,
@@ -926,10 +979,10 @@ class OtherEstimatorWrapper(EstimatorWrapper):
926
979
  num_features = [col for col in x.columns if col not in self.cat_features]
927
980
  x[num_features] = x[num_features].fillna(-999)
928
981
  if self.cat_features:
929
- encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
930
- encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
931
- x[self.cat_features] = encoded
932
- self.cat_encoder = encoder
982
+ for c in self.cat_features:
983
+ if x[c].dtype != "category":
984
+ x[c] = x[c].astype("category")
985
+ params["cat_features"] = self.cat_features
933
986
  for c in x.columns:
934
987
  if x[c].dtype not in ["category", "int64", "float64", "bool"]:
935
988
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -940,15 +993,22 @@ class OtherEstimatorWrapper(EstimatorWrapper):
940
993
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
941
994
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
942
995
  if self.cat_features is not None:
996
+ for c in self.cat_features:
997
+ if x[c].dtype != "category":
998
+ x[c] = x[c].astype("category")
943
999
  num_features = [col for col in x.columns if col not in self.cat_features]
944
- x[num_features] = x[num_features].fillna(-999)
945
- if self.cat_features and self.cat_encoder is not None:
946
- x[self.cat_features] = self.cat_encoder.transform(
947
- x[self.cat_features].astype("object"), y_numpy
948
- ).astype("category")
1000
+ else:
1001
+ num_features = x.columns
1002
+ x[num_features] = x[num_features].fillna(-999)
1003
+
949
1004
  return x, y_numpy, params
950
1005
 
951
1006
 
1007
+ @runtime_checkable
1008
+ class HasTransform(Protocol):
1009
+ def transform(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> pd.DataFrame: ...
1010
+
1011
+
952
1012
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
953
1013
  if scoring is None:
954
1014
  return
@@ -41,6 +41,7 @@ class DateTimeSearchKeyConverter:
41
41
  date_format: Optional[str] = None,
42
42
  logger: Optional[logging.Logger] = None,
43
43
  bundle: Optional[ResourceBundle] = None,
44
+ generate_cyclical_features: bool = True,
44
45
  ):
45
46
  self.date_column = date_column
46
47
  self.date_format = date_format
@@ -51,6 +52,7 @@ class DateTimeSearchKeyConverter:
51
52
  self.logger.setLevel("FATAL")
52
53
  self.generated_features: List[str] = []
53
54
  self.bundle = bundle or get_custom_bundle()
55
+ self.generate_cyclical_features = generate_cyclical_features
54
56
  self.has_old_dates = False
55
57
 
56
58
  @staticmethod
@@ -121,61 +123,63 @@ class DateTimeSearchKeyConverter:
121
123
  df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
124
  self.generated_features.append(cos_feature)
123
125
 
124
- # df["quarter"] = df[self.date_column].dt.quarter
126
+ if self.generate_cyclical_features:
125
127
 
126
- # # Calculate the start date of the quarter for each timestamp
127
- # df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
128
+ df["quarter"] = df[self.date_column].dt.quarter
128
129
 
129
- # # Calculate the day in the quarter
130
- # df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
130
+ # Calculate the start date of the quarter for each timestamp
131
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
131
132
 
132
- # # Vectorized calculation of days_in_quarter
133
- # quarter = df["quarter"]
134
- # start = df["quarter_start"]
135
- # year = start.dt.year
136
- # month = start.dt.month
133
+ # Calculate the day in the quarter
134
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
137
135
 
138
- # quarter_end_year = np.where(quarter == 4, year + 1, year)
139
- # quarter_end_month = np.where(quarter == 4, 1, month + 3)
136
+ # Vectorized calculation of days_in_quarter
137
+ quarter = df["quarter"]
138
+ start = df["quarter_start"]
139
+ year = start.dt.year
140
+ month = start.dt.month
140
141
 
141
- # end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
142
- # end.index = df.index
142
+ quarter_end_year = np.where(quarter == 4, year + 1, year)
143
+ quarter_end_month = np.where(quarter == 4, 1, month + 3)
143
144
 
144
- # df["days_in_quarter"] = (end - start).dt.days
145
+ end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
146
+ end.index = df.index
145
147
 
146
- # add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
148
+ df["days_in_quarter"] = (end - start).dt.days
147
149
 
148
- # df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
150
+ add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
149
151
 
150
- df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
152
+ df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
151
153
 
152
- seconds_without_na = df[seconds].dropna()
153
- if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
154
- self.logger.info("Time found in date search key. Add extra features based on time")
154
+ df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
155
155
 
156
- # Extract basic components
157
- df["second"] = df[self.date_column].dt.second
158
- df["minute"] = df[self.date_column].dt.minute
159
- df["hour"] = df[self.date_column].dt.hour
156
+ seconds_without_na = df[seconds].dropna()
157
+ if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
158
+ self.logger.info("Time found in date search key. Add extra features based on time")
160
159
 
161
- # Apply cyclical transformations
162
- add_cyclical_features(df, "second", 60) # Seconds in a minute
163
- add_cyclical_features(df, "minute", 60) # Minutes in an hour
164
- add_cyclical_features(df, "minute", 30) # Minutes in half an hour
165
- add_cyclical_features(df, "hour", 24) # Hours in a day
160
+ # Extract basic components
161
+ df["second"] = df[self.date_column].dt.second
162
+ df["minute"] = df[self.date_column].dt.minute
163
+ df["hour"] = df[self.date_column].dt.hour
166
164
 
167
- # Drop intermediate columns if not needed
168
- df.drop(columns=["second", "minute", "hour"], inplace=True)
169
- else:
170
- keep_time = False
165
+ # Apply cyclical transformations
166
+ add_cyclical_features(df, "second", 60) # Seconds in a minute
167
+ add_cyclical_features(df, "minute", 60) # Minutes in an hour
168
+ add_cyclical_features(df, "minute", 30) # Minutes in half an hour
169
+ add_cyclical_features(df, "hour", 24) # Hours in a day
170
+
171
+ # Drop intermediate columns if not needed
172
+ df.drop(columns=["second", "minute", "hour"], inplace=True)
173
+ else:
174
+ keep_time = False
171
175
 
172
- for generated_feature in self.generated_features[:]:
173
- if df[generated_feature].dropna().nunique() <= 1:
174
- self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
175
- df.drop(columns=generated_feature, inplace=True)
176
- self.generated_features.remove(generated_feature)
176
+ for generated_feature in self.generated_features[:]:
177
+ if df[generated_feature].dropna().nunique() <= 1:
178
+ self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
179
+ df.drop(columns=generated_feature, inplace=True)
180
+ self.generated_features.remove(generated_feature)
177
181
 
178
- df.drop(columns=seconds, inplace=True)
182
+ df.drop(columns=seconds, inplace=True)
179
183
 
180
184
  if keep_time:
181
185
  df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
@@ -247,99 +251,107 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
247
251
 
248
252
 
249
253
  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
250
- df = df.copy()
251
- seconds = "datetime_seconds"
252
- if isinstance(df[date_col].dtype, pd.PeriodDtype):
253
- df[date_col] = df[date_col].dt.to_timestamp()
254
- else:
255
- df[date_col] = pd.to_datetime(df[date_col])
256
- df[date_col] = df[date_col].dt.tz_localize(None)
257
- df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
258
-
259
- seconds_without_na = df[seconds].dropna()
260
- columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
261
- df.drop(columns=columns_to_drop, inplace=True)
262
- # Date, not datetime
263
- if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
264
- return False
254
+ try:
255
+ df = df.copy()
256
+ seconds = "datetime_seconds"
257
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
258
+ df[date_col] = df[date_col].dt.to_timestamp()
259
+ elif is_numeric_dtype(df[date_col]):
260
+ df[date_col] = pd.to_datetime(df[date_col], unit="ms")
261
+ else:
262
+ df[date_col] = pd.to_datetime(df[date_col])
263
+ df[date_col] = df[date_col].dt.tz_localize(None)
264
+ df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
265
265
 
266
- nunique_dates = df[date_col].nunique()
267
- # Unique dates count more than 270
268
- if nunique_dates < 270:
269
- return False
266
+ seconds_without_na = df[seconds].dropna()
267
+ columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
268
+ df.drop(columns=columns_to_drop, inplace=True)
269
+ # Date, not datetime
270
+ if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
271
+ return False
270
272
 
271
- min_date = df[date_col].min()
272
- max_date = df[date_col].max()
273
- days_delta = (max_date - min_date).days + 1
274
- # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
275
- if nunique_dates / days_delta < 0.3:
276
- return False
273
+ nunique_dates = df[date_col].nunique()
274
+ # Unique dates count more than 270
275
+ if nunique_dates < 270:
276
+ return False
277
+
278
+ min_date = df[date_col].min()
279
+ max_date = df[date_col].max()
280
+ days_delta = (max_date - min_date).days + 1
281
+ # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
282
+ if nunique_dates / days_delta < 0.3:
283
+ return False
277
284
 
278
- accumulated_changing_columns = set()
285
+ accumulated_changing_columns = set()
279
286
 
280
- def check_differences(group: pd.DataFrame):
281
- changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
282
- accumulated_changing_columns.update(changing_columns)
287
+ def check_differences(group: pd.DataFrame):
288
+ changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
289
+ accumulated_changing_columns.update(changing_columns)
283
290
 
284
- def is_multiple_rows(group: pd.DataFrame) -> bool:
285
- return group.shape[0] > 1
291
+ def is_multiple_rows(group: pd.DataFrame) -> bool:
292
+ return group.shape[0] > 1
286
293
 
287
- grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
288
- dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()
294
+ grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
295
+ dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()
289
296
 
290
- # share of dates with more than one record is more than 99%
291
- if dates_with_multiple_rows / nunique_dates < 0.99:
292
- return False
297
+ # share of dates with more than one record is more than 99%
298
+ if dates_with_multiple_rows / nunique_dates < 0.99:
299
+ return False
293
300
 
294
- if df.shape[1] <= 3:
295
- return True
301
+ if df.shape[1] <= 3:
302
+ return True
296
303
 
297
- grouped.apply(check_differences)
298
- return len(accumulated_changing_columns) <= 2
304
+ grouped.apply(check_differences)
305
+ return len(accumulated_changing_columns) <= 2
306
+ except Exception:
307
+ return False
299
308
 
300
309
 
301
310
  def is_dates_distribution_valid(
302
311
  df: pd.DataFrame,
303
312
  search_keys: Dict[str, SearchKey],
304
313
  ) -> bool:
305
- maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
314
+ try:
315
+ maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
306
316
 
307
- if EVAL_SET_INDEX in df.columns:
308
- X = df.query(f"{EVAL_SET_INDEX} == 0")
309
- else:
310
- X = df
317
+ if EVAL_SET_INDEX in df.columns:
318
+ X = df.query(f"{EVAL_SET_INDEX} == 0")
319
+ else:
320
+ X = df
311
321
 
312
- if maybe_date_col is None:
313
- for col in X.columns:
314
- if col in search_keys:
315
- continue
316
- try:
317
- if isinstance(X[col].dtype, pd.PeriodDtype):
322
+ if maybe_date_col is None:
323
+ for col in X.columns:
324
+ if col in search_keys:
325
+ continue
326
+ try:
327
+ if isinstance(X[col].dtype, pd.PeriodDtype):
328
+ pass
329
+ elif pd.__version__ >= "2.0.0":
330
+ # Format mixed to avoid massive warnings
331
+ pd.to_datetime(X[col], format="mixed")
332
+ else:
333
+ pd.to_datetime(X[col])
334
+ maybe_date_col = col
335
+ break
336
+ except Exception:
318
337
  pass
319
- elif pd.__version__ >= "2.0.0":
320
- # Format mixed to avoid massive warnings
321
- pd.to_datetime(X[col], format="mixed")
322
- else:
323
- pd.to_datetime(X[col])
324
- maybe_date_col = col
325
- break
326
- except Exception:
327
- pass
328
-
329
- if maybe_date_col is None:
330
- return
331
-
332
- if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
333
- dates = X[maybe_date_col].dt.to_timestamp().dt.date
334
- elif pd.__version__ >= "2.0.0":
335
- dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
336
- else:
337
- dates = pd.to_datetime(X[maybe_date_col]).dt.date
338
-
339
- date_counts = dates.value_counts().sort_index()
340
-
341
- date_counts_1 = date_counts[: round(len(date_counts) / 2)]
342
- date_counts_2 = date_counts[round(len(date_counts) / 2) :]
343
- ratio = date_counts_2.mean() / date_counts_1.mean()
344
-
345
- return ratio >= 0.8 and ratio <= 1.2
338
+
339
+ if maybe_date_col is None:
340
+ return
341
+
342
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
343
+ dates = X[maybe_date_col].dt.to_timestamp().dt.date
344
+ elif pd.__version__ >= "2.0.0":
345
+ dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
346
+ else:
347
+ dates = pd.to_datetime(X[maybe_date_col]).dt.date
348
+
349
+ date_counts = dates.value_counts().sort_index()
350
+
351
+ date_counts_1 = date_counts[: round(len(date_counts) / 2)]
352
+ date_counts_2 = date_counts[round(len(date_counts) / 2) :]
353
+ ratio = date_counts_2.mean() / date_counts_1.mean()
354
+
355
+ return ratio >= 0.8 and ratio <= 1.2
356
+ except Exception:
357
+ return False
@@ -104,9 +104,9 @@ def remove_fintech_duplicates(
104
104
  sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
105
105
 
106
106
  # Convert date columns for further checks
107
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
108
- sub_df
109
- )
107
+ sub_df = DateTimeSearchKeyConverter(
108
+ date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
109
+ ).convert(sub_df)
110
110
  grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
111
111
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
112
112
 
@@ -36,11 +36,11 @@ class EmailDomainGenerator:
36
36
  self.generated_features = []
37
37
 
38
38
  def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
- # for email_col in self.email_columns:
40
- # domain_feature = email_col + self.DOMAIN_SUFFIX
41
- # if domain_feature not in df.columns:
42
- # df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
- # self.generated_features.append(domain_feature)
39
+ for email_col in self.email_columns:
40
+ domain_feature = email_col + self.DOMAIN_SUFFIX
41
+ if domain_feature not in df.columns:
42
+ df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
+ self.generated_features.append(domain_feature)
44
44
  return df
45
45
 
46
46
  @staticmethod
@@ -1,4 +1,5 @@
1
1
  import functools
2
+ import inspect
2
3
  import numbers
3
4
  import time
4
5
  import warnings
@@ -9,6 +10,7 @@ from traceback import format_exc
9
10
 
10
11
  import numpy as np
11
12
  import scipy.sparse as sp
13
+ from category_encoders import CatBoostEncoder
12
14
  from joblib import Parallel, logger
13
15
  from scipy.sparse import issparse
14
16
  from sklearn import config_context, get_config
@@ -16,10 +18,13 @@ from sklearn.base import clone, is_classifier
16
18
  from sklearn.exceptions import FitFailedWarning, NotFittedError
17
19
  from sklearn.metrics import check_scoring
18
20
  from sklearn.metrics._scorer import _MultimetricScorer
19
- from sklearn.model_selection import StratifiedKFold, check_cv
21
+ from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, check_cv
22
+ from sklearn.preprocessing import OrdinalEncoder
20
23
  from sklearn.utils.fixes import np_version, parse_version
21
24
  from sklearn.utils.validation import indexable
22
25
 
26
+ from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
27
+
23
28
  # from sklearn.model_selection import cross_validate as original_cross_validate
24
29
 
25
30
  _DEFAULT_TAGS = {
@@ -59,6 +64,7 @@ def cross_validate(
59
64
  return_train_score=False,
60
65
  return_estimator=False,
61
66
  error_score=np.nan,
67
+ random_state=None,
62
68
  ):
63
69
  """Evaluate metric(s) by cross-validation and also record fit/score times.
64
70
 
@@ -279,6 +285,8 @@ def cross_validate(
279
285
  return_times=True,
280
286
  return_estimator=return_estimator,
281
287
  error_score=error_score,
288
+ is_timeseries=isinstance(cv, TimeSeriesSplit) or isinstance(cv, BlockedTimeSeriesSplit),
289
+ random_state=random_state,
282
290
  )
283
291
  for train, test in cv.split(x, y, groups)
284
292
  )
@@ -296,6 +304,7 @@ def cross_validate(
296
304
  ret = {}
297
305
  ret["fit_time"] = results["fit_time"]
298
306
  ret["score_time"] = results["score_time"]
307
+ ret["cat_encoder"] = results["cat_encoder"]
299
308
 
300
309
  if return_estimator:
301
310
  ret["estimator"] = results["estimator"]
@@ -320,16 +329,16 @@ def cross_validate(
320
329
  else:
321
330
  shuffle = False
322
331
  if hasattr(cv, "random_state") and shuffle:
323
- random_state = cv.random_state
332
+ cv_random_state = cv.random_state
324
333
  else:
325
- random_state = None
334
+ cv_random_state = None
326
335
  return cross_validate(
327
336
  estimator,
328
337
  x,
329
338
  y,
330
339
  groups=groups,
331
340
  scoring=scoring,
332
- cv=StratifiedKFold(n_splits=cv.get_n_splits(), shuffle=shuffle, random_state=random_state),
341
+ cv=StratifiedKFold(n_splits=cv.get_n_splits(), shuffle=shuffle, random_state=cv_random_state),
333
342
  n_jobs=n_jobs,
334
343
  verbose=verbose,
335
344
  fit_params=fit_params,
@@ -337,21 +346,46 @@ def cross_validate(
337
346
  return_train_score=return_train_score,
338
347
  return_estimator=return_estimator,
339
348
  error_score=error_score,
349
+ random_state=random_state,
340
350
  )
341
351
  raise e
342
352
 
343
353
 
344
- def is_catboost_estimator(estimator):
354
+ def _is_catboost_estimator(estimator):
345
355
  try:
346
356
  from catboost import CatBoostClassifier, CatBoostRegressor
357
+
347
358
  return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
348
359
  except ImportError:
349
360
  return False
350
361
 
351
362
 
352
- def is_lightgbm_estimator(estimator):
363
+ def _supports_cat_features(estimator) -> bool:
364
+ """Check if estimator's fit method accepts cat_features parameter.
365
+
366
+ Parameters
367
+ ----------
368
+ estimator : estimator object
369
+ The estimator to check.
370
+
371
+ Returns
372
+ -------
373
+ bool
374
+ True if estimator's fit method accepts cat_features parameter, False otherwise.
375
+ """
376
+ try:
377
+ # Get the signature of the fit method
378
+ fit_params = inspect.signature(estimator.fit).parameters
379
+ # Check if cat_features is in the parameters
380
+ return "cat_features" in fit_params
381
+ except (AttributeError, ValueError):
382
+ return False
383
+
384
+
385
+ def _is_lightgbm_estimator(estimator):
353
386
  try:
354
387
  from lightgbm import LGBMClassifier, LGBMRegressor
388
+
355
389
  return isinstance(estimator, (LGBMClassifier, LGBMRegressor))
356
390
  except ImportError:
357
391
  return False
@@ -375,6 +409,8 @@ def _fit_and_score(
375
409
  split_progress=None,
376
410
  candidate_progress=None,
377
411
  error_score=np.nan,
412
+ is_timeseries=False,
413
+ random_state=None,
378
414
  ):
379
415
  """Fit estimator and compute scores for a given dataset split.
380
416
 
@@ -509,13 +545,24 @@ def _fit_and_score(
509
545
 
510
546
  result = {}
511
547
  try:
548
+ if "cat_features" in fit_params and fit_params["cat_features"]:
549
+ X_train, y_train, X_test, y_test, cat_features, cat_encoder = _encode_cat_features(
550
+ X_train, y_train, X_test, y_test, fit_params["cat_features"], estimator, is_timeseries, random_state
551
+ )
552
+ if cat_features and _supports_cat_features(estimator):
553
+ fit_params["cat_features"] = cat_features
554
+ else:
555
+ del fit_params["cat_features"]
556
+ else:
557
+ cat_encoder = None
558
+ result["cat_encoder"] = cat_encoder
512
559
  if y_train is None:
513
560
  estimator.fit(X_train, **fit_params)
514
561
  else:
515
- if is_catboost_estimator(estimator):
562
+ if _is_catboost_estimator(estimator):
516
563
  fit_params = fit_params.copy()
517
564
  fit_params["eval_set"] = [(X_test, y_test)]
518
- elif is_lightgbm_estimator(estimator):
565
+ elif _is_lightgbm_estimator(estimator):
519
566
  fit_params = fit_params.copy()
520
567
  fit_params["eval_set"] = [(X_test, y_test)]
521
568
  estimator.fit(X_train, y_train, **fit_params)
@@ -1245,3 +1292,60 @@ def _num_samples(x):
1245
1292
  return len(x)
1246
1293
  except TypeError as type_error:
1247
1294
  raise TypeError(message) from type_error
1295
+
1296
+
1297
+ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimator, is_timeseries, random_state):
1298
+ if _is_catboost_estimator(estimator):
1299
+ if is_timeseries:
1300
+ # Fit encoder on training fold
1301
+ encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
1302
+ encoder.fit(X_train[cat_features], y_train)
1303
+
1304
+ X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
1305
+ X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)
1306
+
1307
+ # Don't use as categorical features, so CatBoost will not encode them
1308
+ return X_train, y_train, X_test, y_test, [], encoder
1309
+ else:
1310
+ return X_train, y_train, X_test, y_test, cat_features, None
1311
+ else:
1312
+ if is_timeseries:
1313
+ # Fit encoder on training fold
1314
+ encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
1315
+ encoder.fit(X_train[cat_features], y_train)
1316
+
1317
+ # Progressive encoding on train (using y)
1318
+ X_train[cat_features] = encoder.transform(X_train[cat_features], y_train).astype(int)
1319
+
1320
+ # Static encoding on validation (no y)
1321
+ X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)
1322
+
1323
+ return X_train, y_train, X_test, y_test, [], encoder
1324
+ else:
1325
+ # Shuffle train data
1326
+ X_train_shuffled, y_train_shuffled = _shuffle_pair(
1327
+ X_train[cat_features].astype("object"), y_train, random_state
1328
+ )
1329
+
1330
+ # Fit encoder on training fold
1331
+ encoder = CatBoostEncoder(random_state=random_state, cols=cat_features)
1332
+ encoder.fit(X_train_shuffled, y_train_shuffled)
1333
+
1334
+ # Progressive encoding on train (using y)
1335
+ X_train[cat_features] = encoder.transform(X_train[cat_features], y_train).astype("category")
1336
+
1337
+ # Static encoding on validation (no y)
1338
+ X_test[cat_features] = encoder.transform(X_test[cat_features]).astype("category")
1339
+
1340
+ return X_train, y_train, X_test, y_test, cat_features, encoder
1341
+
1342
+
1343
+ def _shuffle_pair(X, y, random_state):
1344
+ # If X doesn't have reseted index there could be a problem
1345
+ # shuffled_idx = np.random.RandomState(random_state).permutation(len(X))
1346
+ # return X.iloc[shuffled_idx], pd.Series(y).iloc[shuffled_idx]
1347
+
1348
+ Xy = X.copy()
1349
+ Xy["target"] = y
1350
+ Xy_shuffled = Xy.sample(frac=1, random_state=random_state)
1351
+ return Xy_shuffled.drop(columns="target"), Xy_shuffled["target"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.86.dev1
3
+ Version: 1.2.87.dev3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=x1iyqkRuCxNu5kTIKv8yNfzxxa0JD4GnBFMpKHM2wRM,28
1
+ upgini/__about__.py,sha256=-MoNpjvEXC0uIle8xxIgQduzBZJlNzuW-1rPMTm_xc8,28
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=G0qbRPdlWe9p6cwYF3khP99-0kgAO8N0A2sfQxSLgmM,213446
6
+ upgini/features_enricher.py,sha256=n8KBoBgJApLiRv4wXeSgfS-PfbB1D5aDOJfFnL0q6v8,214487
7
7
  upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
8
8
  upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
9
- upgini/metrics.py,sha256=3cip0_L6-OFew74KsRwzxJDU6UFq05h2v7IsyHLcMRc,43164
9
+ upgini/metrics.py,sha256=CR_MKBcq1RlNMXeqc9S374JzHgunMl-mEmlTnZAm_VI,45236
10
10
  upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -51,10 +51,10 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
51
51
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
52
52
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
53
53
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
54
- upgini/utils/datetime_utils.py,sha256=FKeCc5PQnhMSyLiw8nuiMccmMkrUCj4zCIgpZnffpbU,13569
55
- upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
54
+ upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
55
+ upgini/utils/deduplicate_utils.py,sha256=jm9ARZ0fbJFF3aJqj-xm_T6lNh-WErM0H0h6B_L1xQc,8948
56
56
  upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
57
- upgini/utils/email_utils.py,sha256=TZ_2UL0T7rzXG5WNu3dLUReY15qt6PozEGY_4cyuhdM,5287
57
+ upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
58
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
59
59
  upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
60
60
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
@@ -64,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
- upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
67
+ upgini/utils/sklearn_ext.py,sha256=Mdxz0tc-9zT4QyNccA3B86fY4l0MnLDr94POVdYeCT4,49332
68
68
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
69
69
  upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.86.dev1.dist-info/METADATA,sha256=WbxVPEQbJJMxYSDRTiJAdevnfltYEQ8WjxyGgVv7vaE,49167
74
- upgini-1.2.86.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.86.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.86.dev1.dist-info/RECORD,,
73
+ upgini-1.2.87.dev3.dist-info/METADATA,sha256=Pm-acVK8TpDLvPsO0qluwSjmu0cb3FHmtXmqMj--2Ag,49167
74
+ upgini-1.2.87.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.87.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.87.dev3.dist-info/RECORD,,