upgini 1.1.246a101__py3-none-any.whl → 1.1.248a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

upgini/autofe/operand.py CHANGED
@@ -59,12 +59,14 @@ class PandasOperand(Operand, abc.ABC):
         df_from.loc[np.nan] = np.nan
         return df_to.fillna(np.nan).apply(lambda x: df_from.loc[x])
 
-    def _round_value(self, value: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
+    def _round_value(
+        self, value: Union[pd.Series, pd.DataFrame], precision: Optional[int] = None
+    ) -> Union[pd.Series, pd.DataFrame]:
         if isinstance(value, pd.DataFrame):
             return value.apply(self._round_value, axis=1)
 
         if np.issubdtype(value.dtype, np.floating):
-            precision = np.finfo(value.dtype).precision
+            precision = precision or np.finfo(value.dtype).precision
             return np.trunc(value * 10**precision) / (10**precision)
         else:
             return value
upgini/autofe/unary.py CHANGED
@@ -22,10 +22,10 @@ class Log(PandasOperand):
     output_type = "float"
 
     def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return self._round_value(np.log(np.abs(data.replace(0, np.nan))))
+        return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
 
     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        return self._round_value(np.log(data.replace(0, np.nan).abs()))
+        return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
 
 
 class Sqrt(PandasOperand):
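Taken together, the operand.py and unary.py hunks let callers override the truncation precision, and Log now pins it to 10 decimal places. A minimal, self-contained sketch of the truncation logic (a standalone function, not the package's actual class):

```python
from typing import Optional

import numpy as np
import pandas as pd


def round_value(value: pd.Series, precision: Optional[int] = None) -> pd.Series:
    # Truncate floats to `precision` decimal places; fall back to the dtype's native precision.
    if np.issubdtype(value.dtype, np.floating):
        precision = precision or np.finfo(value.dtype).precision
        return np.trunc(value * 10**precision) / (10**precision)
    return value


data = pd.Series([1.0, 10.0, 0.0])
# Zeros are replaced with NaN before the log, and results are truncated at 10 decimals:
print(round_value(np.log(np.abs(data.replace(0, np.nan))), 10))
# 0.0, 2.3025850929, NaN
```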
upgini/dataset.py CHANGED
@@ -662,15 +662,15 @@ class Dataset: # (pd.DataFrame):
         # if self.task_type != ModelTaskType.MULTICLASS:
         #     self.data[target] = self.data[target].apply(pd.to_numeric, errors="coerce")
 
-        keys_to_validate = [
+        keys_to_validate = {
             key
             for search_group in self.search_keys_checked
             for key in search_group
             if self.columns_renaming.get(key) != EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME
-        ]
-        ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS)
+        }
+        ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
         if (
-            FileColumnMeaningType.IPV6_ADDRESS in self.etalon_def_checked
+            FileColumnMeaningType.IPV6_ADDRESS.value in self.etalon_def_checked
             and ipv4_column is not None
             and ipv4_column in keys_to_validate
         ):
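The switch from `FileColumnMeaningType.IP_ADDRESS` to `FileColumnMeaningType.IP_ADDRESS.value` matters when `etalon_def_checked` is keyed by plain strings rather than enum members. A small illustration with hypothetical enum values and mapping (not the package's actual metadata):

```python
from enum import Enum


class FileColumnMeaningType(Enum):
    IP_ADDRESS = "IP_ADDRESS"
    IPV6_ADDRESS = "IPV6_ADDRESS"


etalon_def = {"IP_ADDRESS": "ip", "IPV6_ADDRESS": "ipv6"}  # string-keyed mapping

assert etalon_def.get(FileColumnMeaningType.IP_ADDRESS) is None        # enum member misses
assert etalon_def.get(FileColumnMeaningType.IP_ADDRESS.value) == "ip"  # .value hits
```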
upgini/features_enricher.py CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import gc
 import hashlib
 import itertools
@@ -9,8 +10,7 @@ import sys
 import tempfile
 import time
 import uuid
-from collections import namedtuple
-from functools import reduce
+from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
@@ -21,6 +21,7 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
+from sklearn.model_selection._split import GroupsConsumerMixin
 
 from upgini.autofe.feature import Feature
 from upgini.data_source.data_source_publisher import CommercialSchema
@@ -59,7 +60,7 @@ from upgini.utils.custom_loss_utils import (
     get_additional_params_custom_loss,
     get_runtime_params_custom_loss,
 )
-from upgini.utils.cv_utils import CVConfig
+from upgini.utils.cv_utils import CVConfig, get_groups
 from upgini.utils.datetime_utils import (
     DateTimeSearchKeyConverter,
     is_blocked_time_series,
@@ -920,6 +921,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_eval_set_dict,
             search_keys,
             groups,
+            _cv,
         ) = prepared_data
 
         gc.collect()
@@ -936,16 +938,6 @@
 
         has_date = self._get_date_column(search_keys) is not None
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
-        _cv = cv or self.cv
-        if groups is None and _cv == CVType.group_k_fold:
-            self.logger.info("Replacing group_k_fold with k_fold as no groups were found")
-            _cv = CVType.k_fold
-        if not isinstance(_cv, BaseCrossValidator):
-            date_column = self._get_date_column(search_keys)
-            date_series = validated_X[date_column] if date_column is not None else None
-            _cv = CVConfig(
-                _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold()
-            ).get_cv()
 
         wrapper = EstimatorWrapper.create(
             estimator,
@@ -1213,7 +1205,7 @@
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
+            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
             extended_X = converter.convert(extended_X, keep_time=True)
             generated_features.extend(converter.generated_features)
         email_column = self._get_email_column(search_keys)
@@ -1265,6 +1257,27 @@
             self.logger.info("Passed X, y and eval_set that differs from passed on fit. Transform will be used")
             return False, X, y, checked_eval_set
 
+    def _get_cv_and_groups(
+        self,
+        X: pd.DataFrame,
+        cv_override: Union[BaseCrossValidator, CVType, str, None],
+        search_keys: Dict[str, SearchKey],
+    ) -> Tuple[BaseCrossValidator, Optional[np.ndarray]]:
+        _cv = cv_override or self.cv
+        group_columns = sorted(self._get_group_columns(X, search_keys))
+        groups = None
+
+        if not isinstance(_cv, BaseCrossValidator):
+            date_column = self._get_date_column(search_keys)
+            date_series = X[date_column] if date_column is not None else None
+            _cv, groups = CVConfig(
+                _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
+            ).get_cv_and_groups(X)
+        elif isinstance(_cv, GroupsConsumerMixin):
+            groups = get_groups(X, group_columns)
+
+        return _cv, groups
+
     def _prepare_data_for_metrics(
         self,
         trace_id: str,
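The new `_get_cv_and_groups` helper centralizes the choice of splitter and, when the splitter consumes groups (for example `GroupKFold`), derives the `groups` array from the search-key columns. A minimal scikit-learn sketch of how such a groups array drives splitting (illustrative data, not the enricher's internals):

```python
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])  # e.g., factorized search-key combinations

cv = GroupKFold(n_splits=3)
for train_idx, test_idx in cv.split(X, y, groups):
    # every group lands entirely in either the train or the test fold
    assert not set(groups[train_idx]) & set(groups[test_idx])
```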
@@ -1275,6 +1288,7 @@
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
+        cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
         search_keys_for_metrics: Optional[List[str]] = None,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
@@ -1290,7 +1304,7 @@
             else None
         )
 
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self._sample_data_for_metrics(
+        sampled_data = self._sample_data_for_metrics(
            trace_id,
            validated_X,
            validated_y,
@@ -1302,6 +1316,7 @@
            progress_bar,
            progress_callback,
         )
+        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
 
         excluding_search_keys = list(search_keys.keys())
         if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
@@ -1325,14 +1340,7 @@
         X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
         enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
 
-        group_columns = sorted(self._get_group_columns(enriched_X_sorted, search_keys))
-        groups = (
-            None
-            if not group_columns or self.cv != CVType.group_k_fold
-            else reduce(
-                lambda left, right: left + "_" + right, [enriched_X_sorted[c].astype(str) for c in group_columns]
-            ).factorize()[0]
-        )
+        cv, groups = self._get_cv_and_groups(enriched_X_sorted, cv_override, search_keys)
 
         existing_filtered_enriched_features = [c for c in filtered_enriched_features if c in enriched_X_sorted.columns]
 
@@ -1382,11 +1390,16 @@
             fitting_eval_set_dict,
             search_keys,
             groups,
+            cv,
         )
 
-    _SampledDataForMetrics = namedtuple(
-        "_SampledDataForMetrics", "X_sampled y_sampled enriched_X eval_set_sampled_dict search_keys"
-    )
+    @dataclass
+    class _SampledDataForMetrics:
+        X_sampled: pd.DataFrame
+        y_sampled: pd.Series
+        enriched_X: pd.DataFrame
+        eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
+        search_keys: Dict[str, SearchKey]
 
     def _sample_data_for_metrics(
         self,
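Replacing the `namedtuple` with a `@dataclass` keeps tuple-style unpacking available through `dataclasses.astuple`, as the `@@ -1302` hunk shows. A tiny standalone sketch of that pattern (hypothetical fields, not the real `_SampledDataForMetrics`):

```python
import dataclasses
from dataclasses import dataclass


@dataclass
class SampledData:
    X_sampled: list
    y_sampled: list
    search_keys: dict


sampled = SampledData(X_sampled=[1, 2], y_sampled=[0, 1], search_keys={"date": "DATE"})
# astuple returns the field values in declaration order, so positional unpacking still works
X_sampled, y_sampled, search_keys = dataclasses.astuple(sampled)
assert search_keys == {"date": "DATE"}
```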
@@ -1571,7 +1584,12 @@
             df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
 
         _, df_with_eval_set_index = remove_fintech_duplicates(
-            df_with_eval_set_index, self.search_keys, self.logger, silent=True
+            df_with_eval_set_index,
+            self.search_keys,
+            date_format=self.date_format,
+            logger=self.logger,
+            silent=True,
+            bundle=self.bundle,
         )
 
         # downsample if need to eval_set threshold
@@ -1673,7 +1691,11 @@
         ):
             search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
             return FeaturesEnricher._SampledDataForMetrics(
-                X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
+                X_sampled=X_sampled,
+                y_sampled=y_sampled,
+                enriched_X=enriched_X,
+                eval_set_sampled_dict=eval_set_sampled_dict,
+                search_keys=search_keys,
             )
 
     def get_search_id(self) -> Optional[str]:
@@ -1810,7 +1832,7 @@
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
+            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
             df = converter.convert(df)
             self.logger.info(f"Date column after convertion: {df[date_column]}")
             generated_features.extend(converter.generated_features)
@@ -1868,7 +1890,9 @@
 
         df_without_features = df.drop(columns=non_keys_columns)
 
-        df_without_features = clean_full_duplicates(df_without_features, self.logger, silent=silent_mode)
+        df_without_features = clean_full_duplicates(
+            df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
+        )
 
         del df
         gc.collect()
@@ -2148,9 +2172,11 @@
 
         df = self.__add_country_code(df, self.fit_search_keys)
 
-        need_full_defuplication, df = remove_fintech_duplicates(df, self.fit_search_keys, self.logger)
+        need_full_defuplication, df = remove_fintech_duplicates(
+            df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
+        )
         if need_full_defuplication:
-            df = clean_full_duplicates(df, self.logger)
+            df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
 
         date_column = self._get_date_column(self.fit_search_keys)
         self.__adjust_cv(df, date_column, model_task_type)
@@ -2158,7 +2184,7 @@
         self.fit_generated_features = []
 
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
+            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
             df = converter.convert(df, keep_time=True)
             self.logger.info(f"Date column after convertion: {df[date_column]}")
             self.fit_generated_features.extend(converter.generated_features)
upgini/metadata.py CHANGED
@@ -306,3 +306,4 @@ class CVType(Enum):
     group_k_fold = "group_k_fold"
     time_series = "time_series"
     blocked_time_series = "blocked_time_series"
+    not_set = "not_set"
upgini/resource_bundle/strings.properties CHANGED
@@ -89,6 +89,8 @@ empty_search_key=Search key {} is empty. Please fill values or remove this searc
 single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
 unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
 date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
+invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
+unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
 invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
 invalid_country=All values of COUNTRY column `{}` are invalid
 invalid_ip=All values of IPv4 column `{}` are invalid
upgini/resource_bundle/strings_widget.properties CHANGED
@@ -9,6 +9,8 @@ dataset_too_few_rows=Labeled dataset size with unique search keys must be not le
 dataset_too_big_file=Too big size of labeled dataset for processing. Please reduce number of rows or columns
 dataset_too_many_rows_registered=Labeled dataset size rows limit is {}. Please sample it and retry
 dataset_all_dates_old=There is empty labeled dataset after removing data before '2000-01-01'
+invalid_date_format=Failed to parse date in column `{}`
+unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
 
 # Validate target
 y_is_constant=Target label is a constant. Relevant feature search requires a non-constant target
upgini/utils/cv_utils.py CHANGED
@@ -1,4 +1,6 @@
-from typing import Any, Dict, Optional, Union
+from functools import reduce
+from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
 
 import pandas as pd
 from sklearn.model_selection import BaseCrossValidator, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
@@ -14,6 +16,9 @@ class CVConfig:
         date_column: Optional[pd.Series],
         random_state=None,
         shuffle_kfold: Optional[bool] = None,
+        test_size: Optional[float] = 0.2,
+        n_folds: Optional[int] = 5,
+        group_columns: Optional[List[str]] = None,
     ):
         if cv_type is None:
             self.cv_type = CVType.k_fold
@@ -24,9 +29,10 @@
         else:
             raise Exception(f"Unexpected type of cv_type: {type(cv_type)}")
 
+        self.group_columns = group_columns
         self.shuffle_kfold: Optional[bool] = shuffle_kfold
-        self.test_size = 0.2
-        self.n_folds = 5
+        self.test_size = test_size
+        self.n_folds = n_folds
         if (self.cv_type == CVType.k_fold or self.cv_type == CVType.group_k_fold) and self.shuffle_kfold is None:
             self.shuffle_kfold = date_column is None or is_constant(date_column)
         if self.shuffle_kfold:
@@ -45,17 +51,37 @@
         config["test_size"] = self.test_size
         return config
 
-    def get_cv(self) -> BaseCrossValidator:
+    def get_cv_and_groups(self, X: pd.DataFrame) -> Tuple[BaseCrossValidator, Optional[np.ndarray]]:
         if self.cv_type == CVType.time_series:
-            return TimeSeriesSplit(n_splits=self.n_folds)
+            return TimeSeriesSplit(n_splits=self.n_folds), None
         elif self.cv_type == CVType.blocked_time_series:
-            return BlockedTimeSeriesSplit(n_splits=self.n_folds, test_size=self.test_size)
-        elif self.cv_type == CVType.group_k_fold and self.shuffle_kfold:
-            return GroupShuffleSplit(n_splits=self.n_folds, test_size=self.test_size, random_state=self.random_state)
-        elif self.cv_type == CVType.group_k_fold:
-            return GroupKFold(n_splits=self.n_folds)
+            return BlockedTimeSeriesSplit(n_splits=self.n_folds, test_size=self.test_size), None
+        elif self.cv_type == CVType.group_k_fold and self.group_columns:
+            groups = get_groups(X, self.group_columns)
+
+            if groups is None or np.unique(groups).size < self.n_folds:
+                return KFold(n_splits=self.n_folds, shuffle=self.shuffle_kfold, random_state=self.random_state), None
+
+            if self.shuffle_kfold:
+                return (
+                    GroupShuffleSplit(n_splits=self.n_folds, test_size=self.test_size, random_state=self.random_state),
+                    groups,
+                )
+            else:
+                return GroupKFold(n_splits=self.n_folds), groups
         else:
-            return KFold(n_splits=self.n_folds, shuffle=self.shuffle_kfold, random_state=self.random_state)
+            return KFold(n_splits=self.n_folds, shuffle=self.shuffle_kfold, random_state=self.random_state), None
+
+
+def get_groups(X: pd.DataFrame, group_columns: Optional[List[str]]) -> Optional[np.ndarray]:
+    existing_group_columns = [c for c in group_columns if c in X.columns]
+    return (
+        None
+        if not group_columns
+        else reduce(
+            lambda left, right: left + "_" + right, [X[c].astype(str) for c in existing_group_columns]
+        ).factorize()[0]
+    )
 
 
 def is_constant(s, dropna=True) -> bool:
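The new module-level `get_groups` builds one integer group id per unique combination of the group columns by string-concatenating them and calling `factorize`. An equivalent sketch with a toy frame (column names are illustrative):

```python
import pandas as pd

X = pd.DataFrame({
    "country": ["US", "US", "DE", "DE"],
    "postal": ["10001", "94105", "10115", "10115"],
})
group_columns = ["country", "postal"]

# Concatenate the key columns into one string per row, then factorize into integer ids
combined = X[group_columns[0]].astype(str)
for col in group_columns[1:]:
    combined = combined + "_" + X[col].astype(str)
groups = combined.factorize()[0]
print(groups)  # [0 1 2 2] -- identical key combinations share a group id
```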
upgini/utils/datetime_utils.py CHANGED
@@ -9,6 +9,7 @@ from dateutil.relativedelta import relativedelta
 from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
 
 from upgini.errors import ValidationError
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 
 DATE_FORMATS = ["%Y-%m-%d", "%d.%m.%y", "%d.%m.%Y", "%m.%d.%y", "%m.%d.%Y", "%Y-%m-%dT%H:%M:%S.%f"]
 
@@ -18,7 +19,13 @@ DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
 class DateTimeSearchKeyConverter:
     DATETIME_COL = "_date_time"
 
-    def __init__(self, date_column: str, date_format: Optional[str] = None, logger: Optional[logging.Logger] = None):
+    def __init__(
+        self,
+        date_column: str,
+        date_format: Optional[str] = None,
+        logger: Optional[logging.Logger] = None,
+        bundle: ResourceBundle = None,
+    ):
         self.date_column = date_column
         self.date_format = date_format
         if logger is not None:
@@ -27,6 +34,7 @@ class DateTimeSearchKeyConverter:
             self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")
         self.generated_features: List[str] = []
+        self.bundle = bundle or get_custom_bundle()
 
     @staticmethod
     def _int_to_opt(i: int) -> Optional[int]:
@@ -71,10 +79,10 @@
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
             elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
-            elif df[self.date_column].apply(lambda x: 0 < x < 10*11).all():
+            elif df[self.date_column].apply(lambda x: 0 < x < 10 * 11).all():
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
             else:
-                msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
+                msg = self.bundle.get("unsupported_date_type").format(self.date_column)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
 
@@ -121,10 +129,7 @@
                 return pd.to_datetime(df[self.date_column], format=date_format)
             except ValueError:
                 pass
-        raise ValidationError(
-            f"Failed to parse date in column `{self.date_column}`. "
-            "Try to pass explicit date format in date_format argument of FeaturesEnricher constructor"
-        )
+        raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
 
 
 def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
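With the hard-coded f-strings gone, the error texts now come from the resource bundle keys added in strings.properties above. A hypothetical, minimal stand-in for that lookup-and-format pattern (the real `ResourceBundle` API is not shown in this diff):

```python
# Hypothetical stand-in for a properties-backed message bundle
messages = {
    "invalid_date_format": (
        "Failed to parse date in column `{}`. "
        "Try to pass explicit date format in date_format argument of FeaturesEnricher constructor"
    ),
    "unsupported_date_type": "Unsupported type of date column `{}`. Convert to datetime please.",
}


def get_message(key: str, *args) -> str:
    return messages[key].format(*args)


print(get_message("invalid_date_format", "purchase_date"))
```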
upgini/utils/deduplicate_utils.py CHANGED
@@ -4,13 +4,18 @@ from typing import Dict, List, Optional, Tuple, Union
 import pandas as pd
 
 from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
-from upgini.resource_bundle import bundle
+from upgini.resource_bundle import ResourceBundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
 
 
 def remove_fintech_duplicates(
-    df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: Optional[Logger] = None, silent=False
+    df: pd.DataFrame,
+    search_keys: Dict[str, SearchKey],
+    date_format: Optional[str] = None,
+    logger: Optional[Logger] = None,
+    silent=False,
+    bundle: ResourceBundle = None,
 ) -> Tuple[bool, pd.DataFrame]:
     # Base checks
     need_full_deduplication = True
@@ -72,7 +77,7 @@
         nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
         sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
 
-        sub_df = DateTimeSearchKeyConverter(date_col).convert(sub_df)
+        sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
         grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
         rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
         if len(rows_with_diff_target) > 0:
@@ -95,7 +100,7 @@
 
 
 def clean_full_duplicates(
-    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False
+    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
 ) -> pd.DataFrame:
     nrows = len(df)
     if nrows == 0:
upgini-1.1.246a101.dist-info/METADATA → upgini-1.1.248a1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.246a101
+Version: 1.1.248a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
upgini-1.1.246a101.dist-info/RECORD → upgini-1.1.248a1.dist-info/RECORD RENAMED
@@ -1,11 +1,11 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=AsDJmEfVvdnBrIXQ2DSjitnTQ-5uu1H59YkhjhBRXcw,50424
+upgini/dataset.py,sha256=8TE_NKdhNeL3mlU-S57bHFub4G0gmyOGtceqO-CRaMk,50436
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=mQPM2dbUh-V_HHISFKmT49irNMv1PcLlpUk5HByHhfI,167889
+upgini/features_enricher.py,sha256=xoaG_cEhQJciYHHE-UkjdkprfThNnxGRe9YvT0NVnoI,168617
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
-upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
+upgini/metadata.py,sha256=5SQjDsQrfpGKT42i0H40C6wxTY_up94xSqAARH140_U,9637
 upgini/metrics.py,sha256=LS2MgEKgmn9VEXsKzxv3pBZ-q71mTnpWu6vL8fYgpo4,26727
 upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
@@ -17,8 +17,8 @@ upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s
 upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
 upgini/autofe/feature.py,sha256=xeqTq35-BX4KCt0xAkk3UZAGzV5VyjorV5AdNdA5yLs,11851
 upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
-upgini/autofe/operand.py,sha256=GpSx-nL2XKnTJ7kvRr_SIFoUMchqYian6SftJ82zsN4,2719
-upgini/autofe/unary.py,sha256=WB-Ovwaz2a-Jscpshg1Om7Ttx6DJ6gQ_fgqtXx_UHuw,2845
+upgini/autofe/operand.py,sha256=Rhy7Ky3we-I1Su1--dS4xdsO3K8neV4rqM_Q4xYE4ug,2779
+upgini/autofe/unary.py,sha256=gyMkrx9bfa3o19zS-4JaRlScHrfeZGBsYe7d_6ePT-0,2853
 upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=PeotNz-taBT6aweQc9xc_pdtUOqmqe1IZr1-3NeUsd0,14008
@@ -28,8 +28,8 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
 upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=0qDpfZBMU4moFXKzb11ALwk5N8FdgyJYzI0wRTvFJbI,25008
-upgini/resource_bundle/strings_widget.properties,sha256=IjPHslEk_fx95Phz3NV9PhVYxqexJgAV9xvZoRPvURc,1433
+upgini/resource_bundle/strings.properties,sha256=dmkObOr0FJPkGQmiybmFcI3RbYjl_uCz_IQ4nm8SRkk,25242
+upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
 upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
@@ -39,9 +39,9 @@ upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCS
 upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
 upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
-upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
-upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
-upgini/utils/deduplicate_utils.py,sha256=qRmytTgheYM6OfLnURrmdkZVN02vvZXL5bm0Yob3svk,5995
+upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
+upgini/utils/datetime_utils.py,sha256=P5no4mFgYpEP6oY524ebTKvKc3TBMJzAYpWdj210_Fw,8699
+upgini/utils/deduplicate_utils.py,sha256=GRPwD8bXZNspKvf19W3SrYjqg1qQMDlZD-BDkHnKYyo,6176
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
 upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -55,8 +55,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
 upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.246a101.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.246a101.dist-info/METADATA,sha256=VfuzobkBVkcJyRz_DEihRaROyfGPYnczR537L6mx2GU,48210
-upgini-1.1.246a101.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-upgini-1.1.246a101.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.246a101.dist-info/RECORD,,
+upgini-1.1.248a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.248a1.dist-info/METADATA,sha256=leV6Cyrr3xf01Jd34x-1asI-e4rk2bjDPJvdUsaagNg,48208
+upgini-1.1.248a1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+upgini-1.1.248a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.248a1.dist-info/RECORD,,