upgini 1.1.237a1-py3-none-any.whl → 1.1.239a1-py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.

Potentially problematic release.


This version of upgini might be problematic.

upgini/autofe/operand.py CHANGED
@@ -1,5 +1,5 @@
  from pydantic import BaseModel
- from typing import Dict, List, Optional, Tuple
+ from typing import Dict, List, Optional, Tuple, Union
  import abc
  import pandas as pd
  import numpy as np
@@ -59,6 +59,16 @@ class PandasOperand(Operand, abc.ABC):
  df_from.loc[np.nan] = np.nan
  return df_to.fillna(np.nan).apply(lambda x: df_from.loc[x])

+ def _round_value(self, value: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
+ if isinstance(value, pd.DataFrame):
+ return value.apply(self._round_value, axis=1)
+
+ if np.issubdtype(value.dtype, np.floating):
+ precision = np.finfo(value.dtype).precision
+ return np.trunc(value * 10**precision) / (10**precision)
+ else:
+ return value
+

  class VectorizableMixin(Operand):
  def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
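
Note on the change above: the new `_round_value` helper truncates floating-point results at the dtype's decimal precision before the unary operands below return them. A minimal standalone sketch of the same truncation idea (illustrative only, not the package's own code path):

    import numpy as np
    import pandas as pd

    # Truncate a float Series at the precision reported for its dtype,
    # mirroring the _round_value logic shown in the diff above.
    s = pd.Series([0.1234567890123456789, 2.0, np.nan], dtype="float64")
    precision = np.finfo(s.dtype).precision  # 15 for float64
    print((np.trunc(s * 10**precision) / (10**precision)).tolist())
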
upgini/autofe/unary.py CHANGED
@@ -22,10 +22,10 @@ class Log(PandasOperand):
  output_type = "float"

  def calculate_unary(self, data: pd.Series) -> pd.Series:
- return np.log(np.abs(data.replace(0, np.nan)))
+ return self._round_value(np.log(np.abs(data.replace(0, np.nan))))

  def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
- return np.log(data.replace(0, np.nan).abs())
+ return self._round_value(np.log(data.replace(0, np.nan).abs()))


  class Sqrt(PandasOperand):
@@ -35,10 +35,10 @@ class Sqrt(PandasOperand):
  output_type = "float"

  def calculate_unary(self, data: pd.Series) -> pd.Series:
- return np.sqrt(np.abs(data))
+ return self._round_value(np.sqrt(np.abs(data)))

  def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
- return np.sqrt(data.abs())
+ return self._round_value(np.sqrt(data.abs()))


  class Square(PandasOperand):
@@ -60,10 +60,10 @@ class Sigmoid(PandasOperand):
  output_type = "float"

  def calculate_unary(self, data: pd.Series) -> pd.Series:
- return 1 / (1 + np.exp(-data))
+ return self._round_value(1 / (1 + np.exp(-data)))

  def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
- return 1 / (1 + np.exp(-data))
+ return self._round_value(1 / (1 + np.exp(-data)))


  class Floor(PandasOperand):
upgini/data_source/data_source_publisher.py CHANGED
@@ -62,6 +62,7 @@ class DataSourcePublisher:
  trace_id = str(uuid.uuid4())

  with MDC(trace_id=trace_id):
+ task_id = None
  try:
  if data_table_uri is None or not data_table_uri.startswith("bq://"):
  raise ValidationError(
@@ -148,6 +149,12 @@ class DataSourcePublisher:
  self.logger.info(msg)
  print(msg)
  return data_table_id
+ except KeyboardInterrupt:
+ if task_id is not None:
+ msg = f"Stopping AdsManagementTask {task_id}"
+ print(msg)
+ self.logger.info(msg)
+ self._rest_client.stop_ads_management_task(task_id, trace_id)
  except Exception:
  self.logger.exception("Failed to register data table")
  raise
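
The new `except KeyboardInterrupt` branch above lets a user interrupt a long-running registration and have the server-side AdsManagementTask stopped. A generic sketch of that cancel-on-interrupt pattern, with hypothetical `start_task` / `wait_for_result` / `stop_task` calls standing in for the real REST client methods:

    import uuid

    def run_with_cancellation(client, request):
        trace_id = str(uuid.uuid4())
        task_id = None  # bound before the try block so the handler can always check it
        try:
            task_id = client.start_task(request, trace_id)      # hypothetical call
            return client.wait_for_result(task_id, trace_id)    # hypothetical call
        except KeyboardInterrupt:
            if task_id is not None:
                print(f"Stopping task {task_id}")
                client.stop_task(task_id, trace_id)              # hypothetical call
            raise
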
upgini/dataset.py CHANGED
@@ -36,14 +36,12 @@ from upgini.metadata import (
  NumericInterval,
  RuntimeParameters,
  SearchCustomization,
- SearchKey,
  )
  from upgini.normalizer.phone_normalizer import PhoneNormalizer
  from upgini.resource_bundle import bundle
  from upgini.sampler.random_under_sampler import RandomUnderSampler
  from upgini.search_task import SearchTask
  from upgini.utils import combine_search_keys
- from upgini.utils.deduplicate_utils import remove_fintech_duplicates
  from upgini.utils.email_utils import EmailSearchKeyConverter

  try:
@@ -83,7 +81,6 @@ class Dataset: # (pd.DataFrame):
  endpoint: Optional[str] = None,
  api_key: Optional[str] = None,
  logger: Optional[logging.Logger] = None,
- client_ip: Optional[str] = None,
  warning_counter: Optional[WarningCounter] = None,
  **kwargs,
  ):
@@ -127,7 +124,6 @@ class Dataset: # (pd.DataFrame):
  else:
  self.logger = logging.getLogger()
  self.logger.setLevel("FATAL")
- self.client_ip = client_ip
  self.warning_counter = warning_counter or WarningCounter()

  def __len__(self):
@@ -822,13 +818,6 @@ class Dataset: # (pd.DataFrame):
  self.__validate_dataset(validate_target, silent_mode)

  if validate_target:
- search_keys = {
- col: SearchKey.from_meaning_type(key_type)
- for col, key_type in self.meaning_types.items()
- if SearchKey.from_meaning_type(key_type) is not None
- }
- self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
-
  self.__validate_target()

  self.__resample()
@@ -1028,7 +1017,7 @@ class Dataset: # (pd.DataFrame):
  task_type=self.task_type,
  endpoint=self.endpoint,
  api_key=self.api_key,
- client_ip=self.client_ip,
+ logger=self.logger,
  )

  def validation(
@@ -1098,7 +1087,7 @@ class Dataset: # (pd.DataFrame):
  initial_search_task_id=initial_search_task_id,
  endpoint=self.endpoint,
  api_key=self.api_key,
- client_ip=self.client_ip,
+ logger=self.logger,
  )

  def prepare_uploading_file(self, base_path: str) -> str:
upgini/features_enricher.py CHANGED
@@ -21,8 +21,8 @@ from scipy.stats import ks_2samp
  from sklearn.base import TransformerMixin
  from sklearn.exceptions import NotFittedError
  from sklearn.model_selection import BaseCrossValidator
- from upgini.autofe.feature import Feature

+ from upgini.autofe.feature import Feature
  from upgini.data_source.data_source_publisher import CommercialSchema
  from upgini.dataset import Dataset
  from upgini.errors import HttpError, ValidationError
@@ -64,7 +64,6 @@ from upgini.utils.datetime_utils import (
  is_blocked_time_series,
  is_time_series,
  )
- from upgini.utils.deduplicate_utils import remove_fintech_duplicates
  from upgini.utils.display_utils import (
  display_html_dataframe,
  do_without_pandas_limits,
@@ -181,17 +180,19 @@ class FeaturesEnricher(TransformerMixin):
  exclude_columns: Optional[List[str]] = None,
  baseline_score_column: Optional[Any] = None,
  client_ip: Optional[str] = None,
+ client_visitorid: Optional[str] = None,
  **kwargs,
  ):
  self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
  if api_key is not None and not isinstance(api_key, str):
  raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
- self.rest_client = get_rest_client(endpoint, self._api_key)
+ self.rest_client = get_rest_client(endpoint, self._api_key, client_ip, client_visitorid)
  self.client_ip = client_ip
+ self.client_visitorid = client_visitorid

  self.logs_enabled = logs_enabled
  if logs_enabled:
- self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip)
+ self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip, client_visitorid)
  else:
  self.logger = logging.getLogger()
  self.logger.setLevel("FATAL")
@@ -232,7 +233,7 @@ class FeaturesEnricher(TransformerMixin):
  self.feature_importances_ = []
  self.search_id = search_id
  if search_id:
- search_task = SearchTask(search_id, endpoint=self.endpoint, api_key=self._api_key, client_ip=client_ip)
+ search_task = SearchTask(search_id, endpoint=self.endpoint, api_key=self._api_key, logger=self.logger)

  print(bundle.get("search_by_task_id_start"))
  trace_id = str(uuid.uuid4())
@@ -296,7 +297,7 @@ class FeaturesEnricher(TransformerMixin):
  def _set_api_key(self, api_key: str):
  self._api_key = api_key
  if self.logs_enabled:
- self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key, self.client_ip)
+ self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key, self.client_ip, self.client_visitorid)

  api_key = property(_get_api_key, _set_api_key)

@@ -679,7 +680,7 @@ class FeaturesEnricher(TransformerMixin):
  return None

  if not metrics_calculation:
- transform_usage = get_rest_client(self.endpoint, self.api_key).get_current_transform_usage(trace_id)
+ transform_usage = self.rest_client.get_current_transform_usage(trace_id)
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
  if transform_usage.has_limit:
  if len(X) > transform_usage.rest_rows:
@@ -1184,8 +1185,8 @@ class FeaturesEnricher(TransformerMixin):
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
  extended_X = converter.convert(extended_X, keep_time=True)
  generated_features.extend(converter.generated_features)
- email_column = self._get_email_column(search_keys)
- hem_column = self._get_hem_column(search_keys)
+ email_column = self.__get_email_column(search_keys)
+ hem_column = self.__get_hem_column(search_keys)
  if email_column:
  converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
  extended_X = converter.convert(extended_X)
@@ -1451,7 +1452,9 @@ class FeaturesEnricher(TransformerMixin):
  rows_to_drop=rows_to_drop,
  )

- original_df_sampled = self.df_with_original_index[self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])]
+ original_df_sampled = self.df_with_original_index[
+ self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
+ ]
  enriched_X = drop_existing_columns(enriched_Xy, TARGET)
  if EVAL_SET_INDEX in original_df_sampled.columns:
  Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
@@ -1506,8 +1509,6 @@ class FeaturesEnricher(TransformerMixin):
  eval_df_with_index[TARGET] = eval_y
  eval_df_with_index[EVAL_SET_INDEX] = idx + 1
  df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
-
- df_with_eval_set_index = remove_fintech_duplicates(df_with_eval_set_index, self.search_keys, self.logger)

  # downsample if need to eval_set threshold
  num_samples = _num_samples(df_with_eval_set_index)
@@ -1744,8 +1745,8 @@ class FeaturesEnricher(TransformerMixin):
  generated_features.extend(converter.generated_features)
  else:
  self.logger.info("Input dataset hasn't date column")
- email_column = self._get_email_column(search_keys)
- hem_column = self._get_hem_column(search_keys)
+ email_column = self.__get_email_column(search_keys)
+ hem_column = self.__get_hem_column(search_keys)
  email_converted_to_hem = False
  if email_column:
  converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
@@ -1806,7 +1807,6 @@ class FeaturesEnricher(TransformerMixin):
  api_key=self.api_key, # type: ignore
  date_format=self.date_format, # type: ignore
  logger=self.logger,
- client_ip=self.client_ip,
  )
  dataset.meaning_types = meaning_types
  dataset.search_keys = combined_search_keys
@@ -1869,7 +1869,7 @@ class FeaturesEnricher(TransformerMixin):
  progress = self.get_progress(trace_id, validation_task)
  except KeyboardInterrupt as e:
  print(bundle.get("search_stopping"))
- get_rest_client(self.endpoint, self.api_key).stop_search_task_v2(
+ self.rest_client.stop_search_task_v2(
  trace_id, validation_task.search_task_id
  )
  self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
@@ -2084,8 +2084,8 @@ class FeaturesEnricher(TransformerMixin):
  self.fit_generated_features.extend(converter.generated_features)
  else:
  self.logger.info("Input dataset hasn't date column")
- email_column = self._get_email_column(self.fit_search_keys)
- hem_column = self._get_hem_column(self.fit_search_keys)
+ email_column = self.__get_email_column(self.fit_search_keys)
+ hem_column = self.__get_hem_column(self.fit_search_keys)
  email_converted_to_hem = False
  if email_column:
  converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
@@ -2141,7 +2141,6 @@ class FeaturesEnricher(TransformerMixin):
  date_format=self.date_format, # type: ignore
  random_state=self.random_state, # type: ignore
  logger=self.logger,
- client_ip=self.client_ip,
  )
  dataset.meaning_types = meaning_types
  dataset.search_keys = combined_search_keys
@@ -2198,7 +2197,7 @@ class FeaturesEnricher(TransformerMixin):
  progress = self.get_progress(trace_id)
  except KeyboardInterrupt as e:
  print(bundle.get("search_stopping"))
- get_rest_client(self.endpoint, self.api_key).stop_search_task_v2(trace_id, self._search_task.search_task_id)
+ self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
  self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
  print(bundle.get("search_stopped"))
  raise e
@@ -2618,22 +2617,16 @@ class FeaturesEnricher(TransformerMixin):
  return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]

  @staticmethod
- def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+ def __get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
  for col, t in search_keys.items():
  if t == SearchKey.EMAIL:
  return col

  @staticmethod
- def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+ def __get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
  for col, t in search_keys.items():
  if t == SearchKey.HEM:
  return col
-
- @staticmethod
- def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
- for col, t in search_keys.items():
- if t == SearchKey.PHONE:
- return col

  def __add_fit_system_record_id(
  self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
@@ -3190,7 +3183,7 @@ class FeaturesEnricher(TransformerMixin):
  metrics_df=self.metrics,
  autofe_descriptions_df=self.get_autofe_features_description(),
  search_id=self._search_task.search_task_id,
- email=get_rest_client(self.endpoint, self.api_key).get_current_email(),
+ email=self.rest_client.get_current_email(),
  search_keys=[str(sk) for sk in self.search_keys.values()],
  )
  except Exception:
@@ -3374,7 +3367,7 @@ class FeaturesEnricher(TransformerMixin):
  pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
  with open(f"{tmp_dir}/eval_y.pickle", "wb") as eval_y_file:
  pickle.dump(sample(eval_set[0][1], eval_xy_sample_index), eval_y_file)
- get_rest_client(self.endpoint, self.api_key).dump_input_files(
+ self.rest_client.dump_input_files(
  trace_id,
  f"{tmp_dir}/x.pickle",
  f"{tmp_dir}/y.pickle",
@@ -3382,13 +3375,13 @@ class FeaturesEnricher(TransformerMixin):
  f"{tmp_dir}/eval_y.pickle",
  )
  else:
- get_rest_client(self.endpoint, self.api_key).dump_input_files(
+ self.rest_client.dump_input_files(
  trace_id,
  f"{tmp_dir}/x.pickle",
  f"{tmp_dir}/y.pickle",
  )
  else:
- get_rest_client(self.endpoint, self.api_key).dump_input_files(
+ self.rest_client.dump_input_files(
  trace_id,
  f"{tmp_dir}/x.pickle",
  )
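
In this release `FeaturesEnricher` also accepts an optional `client_visitorid` alongside `client_ip` and forwards both to the shared REST client and the backend logger. A usage sketch based on the signature in the diff (the search key column and values are illustrative):

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(
        search_keys={"registration_date": SearchKey.DATE},  # illustrative column name
        client_ip="203.0.113.10",           # optional, used for tracking metadata
        client_visitorid="my-visitor-id",   # new optional identifier in this release
    )
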
upgini/http.py CHANGED
@@ -289,7 +289,7 @@ class _RestClient:
  GET_ALL_ADS_DESCRIPTIONS_URI = "private/api/v2/ads/descriptions"
  GET_ACTIVE_ADS_DEFINITIONS_URI = "private/api/v2/ads/definitions"
  UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
- UPLOAD_ONLINE_ALL_URI = "private/api/v2/ads/upload-online-all"
+ STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"

  ACCESS_TOKEN_HEADER_NAME = "Authorization"
  CONTENT_TYPE_HEADER_NAME = "Content-Type"
@@ -301,11 +301,13 @@ class _RestClient:
  USER_AGENT_HEADER_VALUE = "pyupgini/" + __version__
  SEARCH_KEYS_HEADER_NAME = "Search-Keys"

- def __init__(self, service_endpoint, refresh_token, silent_mode=False):
+ def __init__(self, service_endpoint, refresh_token, silent_mode=False, client_ip=None, client_visitorid=None):
  # debug_requests_on()
  self._service_endpoint = service_endpoint
  self._refresh_token = refresh_token
  self.silent_mode = silent_mode
+ self.client_ip = client_ip
+ self.client_visitorid = client_visitorid
  self._access_token = self._refresh_access_token()
  # self._access_token: Optional[str] = None # self._refresh_access_token()
  self.last_refresh_time = time.time()
@@ -470,7 +472,7 @@ class _RestClient:
  )
  files["tracking"] = (
  "tracking.json",
- dumps(get_track_metrics()).encode(),
+ dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
  "application/json",
  )
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
@@ -554,7 +556,7 @@ class _RestClient:
  )
  files["tracking"] = (
  "ide",
- dumps(get_track_metrics()).encode(),
+ dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
  "application/json",
  )

@@ -662,7 +664,7 @@ class _RestClient:
  return ProviderTaskMetadataV2.parse_obj(response)

  def get_current_transform_usage(self, trace_id) -> TransformUsage:
- track_metrics = get_track_metrics()
+ track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
  visitor_id = track_metrics.get("visitorId")
  response = self._with_unauth_retry(
  lambda: self._send_get_req(
@@ -751,6 +753,10 @@ class _RestClient:
  response = self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
  return response["adsManagementTaskId"]

+ def stop_ads_management_task(self, ads_management_task_id: str, trace_id: str):
+ api_path = self.STOP_ADS_MANAGEMENT_TASK_URI_FMT.format(ads_management_task_id)
+ self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id))
+
  # ---

  def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
@@ -901,11 +907,12 @@ def resolve_api_token(api_token: Optional[str]) -> str:
  return DEMO_API_KEY


- def get_rest_client(backend_url: Optional[str] = None, api_token: Optional[str] = None) -> _RestClient:
+ def get_rest_client(backend_url: Optional[str] = None, api_token: Optional[str] = None,
+ client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
  url = _resolve_backend_url(backend_url)
  token = resolve_api_token(api_token)

- return _get_rest_client(url, token)
+ return _get_rest_client(url, token, client_ip, client_visitorid)


  def is_demo_api_key(api_token: Optional[str]) -> bool:
@@ -913,23 +920,27 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:


  @lru_cache()
- def _get_rest_client(backend_url: str, api_token: str) -> _RestClient:
+ def _get_rest_client(backend_url: str, api_token: str,
+ client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
  return _RestClient(backend_url, api_token)


  class BackendLogHandler(logging.Handler):
- def __init__(self, rest_client: _RestClient, client_ip: Optional[str] = None, *args, **kwargs) -> None:
+ def __init__(self, rest_client: _RestClient,
+ client_ip: Optional[str] = None, client_visitorid: Optional[str] = None,
+ *args, **kwargs) -> None:
  super().__init__(*args, **kwargs)
  self.rest_client = rest_client
  self.track_metrics = None
  self.hostname = "0.0.0.0"
  self.client_ip = client_ip
+ self.client_visitorid = client_visitorid

  def emit(self, record: logging.LogRecord) -> None:
  def task():
  try:
  if self.track_metrics is None or len(self.track_metrics) == 0:
- self.track_metrics = get_track_metrics(self.client_ip)
+ self.track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
  self.hostname = self.track_metrics.get("ip") or "0.0.0.0"
  text = self.format(record)
  tags = self.track_metrics
@@ -971,7 +982,8 @@ class LoggerFactory:
  root.handlers.clear()

  def get_logger(
- self, backend_url: Optional[str] = None, api_token: Optional[str] = None, client_ip: Optional[str] = None
+ self, backend_url: Optional[str] = None, api_token: Optional[str] = None,
+ client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
  ) -> logging.Logger:
  url = _resolve_backend_url(backend_url)
  token = resolve_api_token(api_token)
@@ -983,7 +995,7 @@ class LoggerFactory:
  upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
  upgini_logger.handlers.clear()
  rest_client = get_rest_client(backend_url, api_token)
- datadog_handler = BackendLogHandler(rest_client, client_ip)
+ datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
  json_formatter = jsonlogger.JsonFormatter(
  "%(asctime)s %(threadName)s %(name)s %(levelname)s %(message)s",
  timestamp=True,
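
The `UPLOAD_ONLINE_ALL_URI` constant is replaced by a parameterized stop URI, and the new `stop_ads_management_task` simply formats the task id into that path and issues a POST through the retry helper. A tiny sketch of the path formatting (the task id is hypothetical):

    STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"

    api_path = STOP_ADS_MANAGEMENT_TASK_URI_FMT.format("task-123")
    assert api_path == "private/api/v2/ads/management-task/task-123/stop"
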
upgini/resource_bundle/strings.properties CHANGED
@@ -142,7 +142,6 @@ dataset_empty_column_names=Some column names are empty. Add names please
  dataset_too_long_column_name=Column {} is too long: {} characters. Remove this column or trim length to 50 characters
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
- dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
upgini/search_task.py CHANGED
@@ -1,3 +1,4 @@
+ import logging
  import tempfile
  import time
  from functools import lru_cache
@@ -43,7 +44,7 @@ class SearchTask:
  task_type: Optional[ModelTaskType] = None,
  endpoint: Optional[str] = None,
  api_key: Optional[str] = None,
- client_ip: Optional[str] = None,
+ logger: Optional[logging.Logger] = None,
  ):
  self.search_task_id = search_task_id
  self.initial_search_task_id = initial_search_task_id
@@ -55,7 +56,11 @@ class SearchTask:
  self.summary = None
  self.endpoint = endpoint
  self.api_key = api_key
- self.logger = LoggerFactory().get_logger(endpoint, api_key, client_ip)
+ if logger is not None:
+ self.logger = logger
+ else:
+ self.logger = logging.getLogger()
+ self.logger.setLevel("FATAL")
  self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
  self.unused_features_for_generation: Optional[List[str]] = None
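
`SearchTask` no longer builds a backend logger from `client_ip`; callers can now inject a `logging.Logger`, and without one it falls back to a silent root logger. A small sketch of the injection, assuming a hypothetical task id:

    import logging

    from upgini.search_task import SearchTask

    logger = logging.getLogger("my_app.upgini")
    task = SearchTask("hypothetical-search-task-id", logger=logger)
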
 
upgini/utils/datetime_utils.py CHANGED
@@ -61,22 +61,9 @@ class DateTimeSearchKeyConverter:
  elif is_period_dtype(df[self.date_column]):
  df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
  elif is_numeric_dtype(df[self.date_column]):
- # 315532801 - 2524608001 - seconds
- # 315532801000 - 2524608001000 - milliseconds
- # 315532801000000 - 2524608001000000 - microseconds
- # 315532801000000000 - 2524608001000000000 - nanoseconds
- if df[self.date_column].apply(lambda x: 10**16 < x).all():
- df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
- elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
- df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
- elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
- df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
- elif df[self.date_column].apply(lambda x: 0 < x < 10*11).all():
- df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
- else:
- msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
- self.logger.warning(msg)
- raise ValidationError(msg)
+ msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
+ self.logger.warning(msg)
+ raise ValidationError(msg)

  # If column with date is datetime then extract seconds of the day and minute of the hour
  # as additional features
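
With the branch above removed, numeric date columns are no longer auto-detected as epoch seconds/milliseconds/microseconds/nanoseconds; they now raise `ValidationError`. Callers should convert such columns to datetime themselves before passing the data, for example:

    import pandas as pd

    df = pd.DataFrame({"event_date": [1672531200, 1672617600]})  # epoch seconds, illustrative
    df["event_date"] = pd.to_datetime(df["event_date"], unit="s")
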
upgini/utils/track_info.py CHANGED
@@ -36,18 +36,22 @@ def _env_contains(envs) -> bool:


  def _get_execution_ide() -> str:
- if "google.colab" in sys.modules and _env_contains(_ide_env_variables["colab"]):
- return "colab"
- elif os.path.exists("/kaggle") and _check_installed("kaggle") and _env_contains(_ide_env_variables["kaggle"]):
- return "kaggle"
- elif getuser() == "jovyan" and _env_contains(_ide_env_variables["binder"]):
- return "binder"
- else:
+ try:
+ if "google.colab" in sys.modules and _env_contains(_ide_env_variables["colab"]):
+ return "colab"
+ elif os.path.exists("/kaggle") and _check_installed("kaggle") and _env_contains(_ide_env_variables["kaggle"]):
+ return "kaggle"
+ elif getuser() == "jovyan" and _env_contains(_ide_env_variables["binder"]):
+ return "binder"
+ elif "widget" in socket.gethostname():
+ return "widget"
+ else:
+ return "other"
+ except Exception:
  return "other"

-
  @lru_cache()
- def get_track_metrics(client_ip: Optional[str] = None) -> dict:
+ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
  # default values
  track = {"ide": _get_execution_ide()}
  ident_res = "https://api.ipify.org"
@@ -66,24 +70,10 @@ def get_track_metrics(client_ip: Optional[str] = None) -> dict:
  from google.colab import output # type: ignore
  from IPython.display import Javascript, display

- # path_to_script = Path(__file__).parent.parent.resolve() / "fingerprint.js"
- # with open(path_to_script) as f:
- # js_content = f.read()
- # print(f"JS loaded. Length: {len(js_content)}")
-
  display(
  Javascript(
- # """
- # async function loadModuleFromString(code) {
- # const blob = new Blob([code], { type: 'application/javascript' });
- # const url = URL.createObjectURL(blob);
- # const module = await import(url);
- # URL.revokeObjectURL(url); // Clean URL-object after module load
- # return module;
- # }
- # window.visitorId = loadModuleFromString(""" + js_content + """)
  """
- window.visitorId = import('https://openfpcdn.io/fingerprintjs/v3')
+ window.visitorId = import('https://upgini.github.io/upgini/js/visitorid.js')
  .then(FingerprintJS => FingerprintJS.load())
  .then(fp => fp.get())
  .then(result => result.visitorId);
@@ -153,7 +143,10 @@ def get_track_metrics(client_ip: Optional[str] = None) -> dict:
  track["ip"] = client_ip
  else:
  track["ip"] = get(ident_res, timeout=10).text
- track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
+ if client_visitorid:
+ track["visitorId"] = client_visitorid
+ else:
+ track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
  except Exception as e:
  track["err"] = str(e)
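
`get_track_metrics` now prefers an explicitly supplied `client_visitorid` and only falls back to hashing the machine's MAC address. A standalone sketch of that fallback (it mirrors the logic in the diff, not the function itself):

    from hashlib import sha256
    from uuid import getnode

    def resolve_visitor_id(client_visitorid=None):
        # an explicit id wins; otherwise derive a stable hash from the MAC address
        if client_visitorid:
            return client_visitorid
        return sha256(str(getnode()).encode()).hexdigest()

    print(resolve_visitor_id())                  # machine-derived fallback
    print(resolve_visitor_id("my-visitor-id"))   # explicit override
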
 
upgini-1.1.237a1.dist-info/METADATA → upgini-1.1.239a1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.237a1
+ Version: 1.1.239a1
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
@@ -823,11 +823,11 @@ Requests and support, in preferred order
  - **scoped to a Single Bug** - one bug per report.

  ## 🧩 Contributing
- We are a **very** small team and this is a part-time project for us, thus most probably we won't be able:
+ We are not a large team, so we probably won't be able to:
  - implement smooth integration with most common low-code ML libraries and platforms ([PyCaret](https://www.github.com/pycaret/pycaret), [H2O AutoML](https://github.com//h2oai/h2o-3/blob/master/h2o-docs/src/product/automl.rst), etc. )
- - implement all possible data verification and normalization capabilities for different types of search keys (we just started with current 6 types)
+ - implement all possible data verification and normalization capabilities for different types of search keys
+ And we need some help from the community!

- And we need some help from community)
  So, we'll be happy about every **pull request** you open and **issue** you find to make this library **more incredible**. Please note that it might sometimes take us a while to get back to you.
  **For major changes**, please open an issue first to discuss what you would like to change
  #### Developing
upgini-1.1.237a1.dist-info/RECORD → upgini-1.1.239a1.dist-info/RECORD RENAMED
@@ -1,13 +1,13 @@
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
- upgini/dataset.py,sha256=4u9ziFdgPOqPn-jgdG6e2GqXmjJo34DKRhSft9W_H6s,50174
+ upgini/dataset.py,sha256=qSjv09LKzCYayucb_JlhExw9uSRcscLWTaD8hqATE3s,49676
  upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
- upgini/features_enricher.py,sha256=mfm5rXHW2aesg4vmpeuq3Mz_VA05Jf70uQqwCNg-2WI,160756
+ upgini/features_enricher.py,sha256=9RJi8NwYbXPK-vgWiMcYoD4I2wO0D91Uk-tvL_1nJ-8,160271
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
- upgini/http.py,sha256=HzUSZudCdISJGUqHC1gAT1v_x1n_dIFVDJW4z3Q7DCs,41204
+ upgini/http.py,sha256=RG93QmV3mqKixQsSHqYeM1Mtucp-EpdavcpCuhufnGE,42141
  upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
  upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
- upgini/search_task.py,sha256=7YxH1zrUHMmePO0VbPBBCJjeoer7jAC0Gltc9EVAOIg,17126
+ upgini/search_task.py,sha256=sqgb5MfwWXg6YAbVhLOPcVJ5tDCUyzxFRWfd9aWj8SM,17236
  upgini/spinner.py,sha256=yhakBaydMNS8E8TRAwTdCMdnWrHeWT0cR1M8c9hP6jA,1157
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -17,18 +17,18 @@ upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
  upgini/autofe/feature.py,sha256=d_iikjQJYgTOkZrXON_IWY5S22OkSpCsk6lfbmVA9ts,11825
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
- upgini/autofe/operand.py,sha256=8WqEoSIA5rEWCK1xuC303E4NW5a72GZ5jUMAEj4skII,2291
- upgini/autofe/unary.py,sha256=7TBe7PCt7l_XQEqu_G5g_TC2cW3tppL7uPDcX8xsqz0,2731
+ upgini/autofe/operand.py,sha256=GpSx-nL2XKnTJ7kvRr_SIFoUMchqYian6SftJ82zsN4,2719
+ upgini/autofe/unary.py,sha256=WB-Ovwaz2a-Jscpshg1Om7Ttx6DJ6gQ_fgqtXx_UHuw,2845
  upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- upgini/data_source/data_source_publisher.py,sha256=zFu0WMKwPM11gPZHq8dpsBP7s4wmTtBqYoDEakgNxoY,13725
+ upgini/data_source/data_source_publisher.py,sha256=xvHi4N4m32eqB_h_qtY1wAt1dXekM5PdNL2T9JzFQD4,14051
  upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
  upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/normalizer/phone_normalizer.py,sha256=VIgLXuDuzzjPEXiy_LyDVLZKGaS7-le6Fh6T4D-TQDU,9930
  upgini/resource_bundle/__init__.py,sha256=M7GtS7KPQw9pinz8P2aQWXpSkD2YFwUPVGk1w92Pn84,7888
  upgini/resource_bundle/exceptions.py,sha256=KT-OnqA2J4OTfLjhbEl3KFZM2ci7EOPjqJuY_rXp3vs,622
- upgini/resource_bundle/strings.properties,sha256=2Lad26Y4spPt_i5EYfOPg5XInBU7CuQxH8mDUWKzbDo,24829
+ upgini/resource_bundle/strings.properties,sha256=1mpOkd_wkKIJGwWRBgfXz0mLx4lqdDro5IUoj8BBxuE,24527
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/sampler/base.py,sha256=X2PVsfZ3Rl7twpFDh5UWyxqY2K_jcMGxZ2NcHLwFRj4,6489
  upgini/sampler/random_under_sampler.py,sha256=whX_f_TtalHH8Seyn_7n3sX_TSiDHeYfALmme9saqDg,4082
@@ -39,8 +39,7 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
  upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o,6436
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
  upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
- upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
- upgini/utils/deduplicate_utils.py,sha256=GoZaMslZzVOaHiRHEVkznWuH55OwKKOxcIKsDAgzpBM,2728
+ upgini/utils/datetime_utils.py,sha256=P56e7gcgAogJYfs2Blzk1uypxb9yrFzNaeJpMCRm6Zc,7716
  upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
  upgini/utils/email_utils.py,sha256=MhCLUAWqbp81xRyKizauNhVx6t_MFeJQRQ8pFM7EpFo,3480
  upgini/utils/fallback_progress_bar.py,sha256=f-VzVbiO6oU9WoKzEgoegYotixdiKanGlvdQCOGC-NY,1128
@@ -52,10 +51,10 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
  upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,43847
  upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
- upgini/utils/track_info.py,sha256=DVNVZmXUb4f25DSPEuUNEFx49hNEBfmuY9iSW5jkMnI,5708
+ upgini/utils/track_info.py,sha256=jPOiIGpAG_zvHgeiFe_pQ4TWC9ZPjnd_5hSOu5tzLi4,5207
  upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
- upgini-1.1.237a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.1.237a1.dist-info/METADATA,sha256=JN2NeYkcwMCKWMS68wvKqpI4YvJahinoKi_M9jq1kiw,48346
- upgini-1.1.237a1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- upgini-1.1.237a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
- upgini-1.1.237a1.dist-info/RECORD,,
+ upgini-1.1.239a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.1.239a1.dist-info/METADATA,sha256=Sl4XSdmxJTR9080xw55QKFkoMDFMHuspXT_54E07mm0,48264
+ upgini-1.1.239a1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ upgini-1.1.239a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+ upgini-1.1.239a1.dist-info/RECORD,,
upgini/utils/deduplicate_utils.py DELETED
@@ -1,71 +0,0 @@
- from logging import Logger
- from typing import Dict, List, Optional, Union
-
- import pandas as pd
-
- from upgini.metadata import TARGET, ModelTaskType, SearchKey
- from upgini.resource_bundle import bundle
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
- from upgini.utils.target_utils import define_task
-
-
- def remove_fintech_duplicates(df: pd.DataFrame,
- search_keys: Dict[str, SearchKey],
- logger: Optional[Logger] = None) -> pd.DataFrame:
- if define_task(df.target, silent=True) != ModelTaskType.BINARY:
- return df
-
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
- if date_col is None:
- return df
-
- personal_cols = []
- phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
- if phone_col:
- personal_cols.append(phone_col)
- email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
- if email_col:
- personal_cols.append(email_col)
- hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
- if hem_col:
- personal_cols.append(hem_col)
- if len(personal_cols) == 0:
- return df
-
- duplicates = df.duplicated(personal_cols, keep=False)
- duplicate_rows = df[duplicates]
- if len(duplicate_rows) == 0:
- return df
-
- grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
-
- uniques = grouped_by_personal_cols[date_col].nunique()
- total = len(uniques)
- diff_dates = len(uniques[uniques > 1])
- if diff_dates / total >= 0.6:
- return df
-
- if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
- return df
-
- def has_diff_target_within_60_days(rows):
- rows = DateTimeSearchKeyConverter(date_col).convert(rows)
- rows = rows.sort_values(by=date_col)
- return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
-
- rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
- if len(rows_with_diff_target) > 0:
- perc = len(rows_with_diff_target) * 100 / len(df)
- msg = bundle.get("dataset_diff_target_duplicates_fintech").format(perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list())
- print(msg)
- if logger:
- logger.warning(msg)
- df = df[~df.index.isin(rows_with_diff_target.index)]
-
- return df
-
-
- def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
- for col, key_type in search_keys.items():
- if (isinstance(keys, list) and key_type in keys) or key_type == keys:
- return col