upgini 1.1.242a3__tar.gz → 1.1.244a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.
Files changed (81):
  1. {upgini-1.1.242a3/src/upgini.egg-info → upgini-1.1.244a1}/PKG-INFO +1 -1
  2. {upgini-1.1.242a3 → upgini-1.1.244a1}/setup.py +1 -1
  3. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/ads_management/ads_manager.py +0 -1
  4. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/feature.py +12 -5
  5. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/data_source/data_source_publisher.py +1 -6
  6. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/dataset.py +20 -4
  7. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/errors.py +0 -1
  8. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/features_enricher.py +31 -38
  9. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/http.py +24 -14
  10. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/mdc/__init__.py +1 -2
  11. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/mdc/context.py +1 -5
  12. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/normalizer/phone_normalizer.py +3 -4
  13. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/resource_bundle/exceptions.py +0 -1
  14. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/resource_bundle/strings.properties +1 -0
  15. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/base.py +3 -9
  16. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/random_under_sampler.py +1 -3
  17. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/search_task.py +4 -10
  18. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/spinner.py +1 -7
  19. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/country_utils.py +3 -1
  20. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/datetime_utils.py +16 -3
  21. upgini-1.1.244a1/src/upgini/utils/deduplicate_utils.py +82 -0
  22. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/email_utils.py +0 -1
  23. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/fallback_progress_bar.py +5 -8
  24. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/warning_counter.py +0 -1
  25. {upgini-1.1.242a3 → upgini-1.1.244a1/src/upgini.egg-info}/PKG-INFO +1 -1
  26. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/SOURCES.txt +1 -0
  27. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_datetime_utils.py +36 -30
  28. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_email_utils.py +1 -1
  29. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_etalon_validation.py +13 -12
  30. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_widget.py +1 -1
  31. {upgini-1.1.242a3 → upgini-1.1.244a1}/LICENSE +0 -0
  32. {upgini-1.1.242a3 → upgini-1.1.244a1}/README.md +0 -0
  33. {upgini-1.1.242a3 → upgini-1.1.244a1}/pyproject.toml +0 -0
  34. {upgini-1.1.242a3 → upgini-1.1.244a1}/setup.cfg +0 -0
  35. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/__init__.py +0 -0
  36. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/ads.py +0 -0
  37. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/ads_management/__init__.py +0 -0
  38. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/__init__.py +0 -0
  39. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/all_operands.py +0 -0
  40. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/binary.py +0 -0
  41. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/groupby.py +0 -0
  42. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/operand.py +0 -0
  43. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/unary.py +0 -0
  44. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/vector.py +0 -0
  45. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/data_source/__init__.py +0 -0
  46. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/fingerprint.js +0 -0
  47. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/metadata.py +0 -0
  48. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/metrics.py +0 -0
  49. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/normalizer/__init__.py +0 -0
  50. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/resource_bundle/__init__.py +0 -0
  51. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/__init__.py +0 -0
  52. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/utils.py +0 -0
  53. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/__init__.py +0 -0
  54. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  55. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/blocked_time_series.py +0 -0
  56. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  57. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/cv_utils.py +0 -0
  58. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/display_utils.py +0 -0
  59. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/features_validator.py +0 -0
  60. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/format.py +0 -0
  61. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/ip_utils.py +1 -1
  62. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/phone_utils.py +0 -0
  63. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/postal_code_utils.py +0 -0
  64. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/target_utils.py +0 -0
  67. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/version_validator.py +0 -0
  69. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/dependency_links.txt +0 -0
  70. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/requires.txt +0 -0
  71. {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/top_level.txt +0 -0
  72. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_binary_dataset.py +0 -0
  73. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_blocked_time_series.py +0 -0
  74. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_categorical_dataset.py +0 -0
  75. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_continuous_dataset.py +0 -0
  76. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_country_utils.py +0 -0
  77. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_custom_loss_utils.py +0 -0
  78. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_features_enricher.py +0 -0
  79. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_metrics.py +0 -0
  80. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_phone_utils.py +0 -0
  81. {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_postal_code_utils.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.242a3
+Version: 1.1.244a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
 
 
 here = Path(__file__).parent.resolve()
-version = "1.1.242a3"
+version = "1.1.244a1"
 try:
     send_log(f"Start setup PyLib version {version}")
     setup(
@@ -7,7 +7,6 @@ import pandas as pd
 
 
 class AdsManager:
-
     FINAL_STATUSES = ["COMPLETED", "FAILED", "TIMED_OUT"]
 
     def __init__(self, api_key: Optional[str] = None, backend_url: Optional[str] = None):
@@ -53,9 +53,15 @@ class Column:
 
 
 class Feature:
-    def __init__(self, op: Operand, children: List[Union[Column, "Feature"]], data: Optional[pd.DataFrame] = None,
-                 display_index: Optional[str] = None, cached_display_name: Optional[str] = None,
-                 alias: Optional[str] = None):
+    def __init__(
+        self,
+        op: Operand,
+        children: List[Union[Column, "Feature"]],
+        data: Optional[pd.DataFrame] = None,
+        display_index: Optional[str] = None,
+        cached_display_name: Optional[str] = None,
+        alias: Optional[str] = None,
+    ):
         self.op = op
         self.children = children
         self.data = data
@@ -258,8 +264,9 @@ class Feature:
 
 
 class FeatureGroup:
-    def __init__(self, op: Operand, main_column: Optional[Union[Column, Feature]],
-                 children: List[Union[Column, Feature]]):
+    def __init__(
+        self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
+    ):
         self.op = op
         self.main_column_node = main_column
         self.children = children
@@ -31,7 +31,6 @@ class OnlineUploadingType(Enum):
 
 
 class DataSourcePublisher:
-
     FINAL_STATUSES = ["COMPLETED", "FAILED", "TIMED_OUT"]
     DEFAULT_GENERATE_EMBEDDINGS = []
 
@@ -259,11 +258,7 @@ class DataSourcePublisher:
         except Exception:
             self.logger.exception(f"Failed to deactivate data tables {data_table_ids} for clients {client_emails}")
 
-    def upload_online(
-        self,
-        bq_table_id: Optional[str] = None,
-        search_keys: Optional[List[SearchKey]] = None
-    ):
+    def upload_online(self, bq_table_id: Optional[str] = None, search_keys: Optional[List[SearchKey]] = None):
         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
             if bq_table_id is None and search_keys is None:
@@ -36,12 +36,14 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
+    SearchKey,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
+from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.email_utils import EmailSearchKeyConverter
 
 try:
@@ -346,9 +348,11 @@ class Dataset: # (pd.DataFrame):
 
         ipv6 = ip + "_v6"
         self.data[ipv6] = (
-            self.data[ip].apply(self._to_ipv6)
-            .apply(self.__ip_to_int)
-            .astype("string").str.replace(".0", "", regex=False)
+            self.data[ip]
+            .apply(self._to_ipv6)
+            .apply(self.__ip_to_int)
+            .astype("string")
+            .str.replace(".0", "", regex=False)
         )
         self.data = self.data.drop(columns=ip)
         self.meaning_types[ipv6] = FileColumnMeaningType.IPV6_ADDRESS
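Note on the chain above: the final str.replace strips a trailing ".0" from the stringified integers, plausibly because missing values widen the integer results to float before astype("string"). A tiny standalone sketch of that effect (illustrative values, not the library API):

import pandas as pd

# A None forces the integer IPs into a float dtype, so the string cast
# leaves a ".0" suffix that has to be removed afterwards.
s = pd.Series([3232235777.0, None])
as_text = s.astype("string").str.replace(".0", "", regex=False)
print(as_text.tolist())  # ['3232235777', <NA>]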
@@ -811,7 +815,19 @@ class Dataset: # (pd.DataFrame):
 
         self.__convert_features_types()
 
-        self.__clean_duplicates(silent_mode)
+        search_keys = {
+            col: SearchKey.from_meaning_type(key_type)
+            for col, key_type in self.meaning_types.items()
+            if SearchKey.from_meaning_type(key_type) is not None
+        }
+
+        if validate_target:
+            need_full_deduplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
+        else:
+            need_full_deduplication = True
+
+        if need_full_deduplication:
+            self.__clean_duplicates(silent_mode)
 
         self.__validate_dataset(validate_target, silent_mode)
 
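The hunk above derives a search-key mapping from the column meaning types and, when the target is validated, lets the new fintech-specific pass decide whether the generic duplicate cleanup still needs to run. A minimal sketch of that gating, with a hypothetical stub standing in for the real remove_fintech_duplicates:

import pandas as pd

def remove_fintech_duplicates_stub(df):
    # The real helper returns (need_full_deduplication, possibly_filtered_df);
    # this stub only mirrors the shape of that contract.
    return False, df

df = pd.DataFrame({"phone": ["555-0101", "555-0101"], "target": [0, 0]})
validate_target = True

if validate_target:
    need_full_deduplication, df = remove_fintech_duplicates_stub(df)
else:
    need_full_deduplication = True

if need_full_deduplication:
    df = df.drop_duplicates()  # stands in for Dataset.__clean_duplicates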
@@ -26,6 +26,5 @@ class UpginiConnectionError(Exception):
 
 
 class ValidationError(Exception):
-
     def __init__(self, message):
         super(ValidationError, self).__init__(message)
@@ -64,6 +64,7 @@ from upgini.utils.datetime_utils import (
     is_blocked_time_series,
     is_time_series,
 )
+from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.display_utils import (
     display_html_dataframe,
     do_without_pandas_limits,
@@ -297,8 +298,9 @@ class FeaturesEnricher(TransformerMixin):
     def _set_api_key(self, api_key: str):
         self._api_key = api_key
         if self.logs_enabled:
-            self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key,
-                                                     self.client_ip, self.client_visitorid)
+            self.logger = LoggerFactory().get_logger(
+                self.endpoint, self._api_key, self.client_ip, self.client_visitorid
+            )
 
     api_key = property(_get_api_key, _set_api_key)
 
@@ -856,7 +858,7 @@ class FeaturesEnricher(TransformerMixin):
 
         if X is not None and y is None:
             raise ValidationError("X passed without y")
-
+
         effective_X = X if X is not None else self.X
         effective_eval_set = eval_set if eval_set is not None else self.eval_set
 
@@ -1200,8 +1202,8 @@ class FeaturesEnricher(TransformerMixin):
             converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
             extended_X = converter.convert(extended_X, keep_time=True)
             generated_features.extend(converter.generated_features)
-        email_column = self.__get_email_column(search_keys)
-        hem_column = self.__get_hem_column(search_keys)
+        email_column = self._get_email_column(search_keys)
+        hem_column = self._get_hem_column(search_keys)
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
             extended_X = converter.convert(extended_X)
@@ -1469,7 +1471,7 @@ class FeaturesEnricher(TransformerMixin):
 
         original_df_sampled = self.df_with_original_index[
             self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
-            ]
+        ]
         enriched_X = drop_existing_columns(enriched_Xy, TARGET)
         if EVAL_SET_INDEX in original_df_sampled.columns:
             Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
@@ -1525,6 +1527,10 @@ class FeaturesEnricher(TransformerMixin):
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
 
+        _, df_with_eval_set_index = remove_fintech_duplicates(
+            df_with_eval_set_index, self.search_keys, self.logger, silent=True
+        )
+
         # downsample if need to eval_set threshold
         num_samples = _num_samples(df_with_eval_set_index)
         if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
@@ -1534,9 +1540,7 @@ class FeaturesEnricher(TransformerMixin):
             )
 
             X_sampled = (
-                df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0")
-                .copy()
-                .drop(columns=[EVAL_SET_INDEX, TARGET])
+                df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy().drop(columns=[EVAL_SET_INDEX, TARGET])
             )
             X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
             y_sampled = df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy()[TARGET]
@@ -1760,8 +1764,8 @@ class FeaturesEnricher(TransformerMixin):
             generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
-        email_column = self.__get_email_column(search_keys)
-        hem_column = self.__get_hem_column(search_keys)
+        email_column = self._get_email_column(search_keys)
+        hem_column = self._get_hem_column(search_keys)
         email_converted_to_hem = False
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
@@ -1883,9 +1887,7 @@ class FeaturesEnricher(TransformerMixin):
                 progress = self.get_progress(trace_id, validation_task)
             except KeyboardInterrupt as e:
                 print(bundle.get("search_stopping"))
-                self.rest_client.stop_search_task_v2(
-                    trace_id, validation_task.search_task_id
-                )
+                self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
                 self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
                 print(bundle.get("search_stopped"))
                 raise e
@@ -2098,8 +2100,8 @@ class FeaturesEnricher(TransformerMixin):
             self.fit_generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
-        email_column = self.__get_email_column(self.fit_search_keys)
-        hem_column = self.__get_hem_column(self.fit_search_keys)
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
         email_converted_to_hem = False
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
@@ -2481,21 +2483,6 @@ class FeaturesEnricher(TransformerMixin):
             raise ValidationError(bundle.get("y_is_constant_eval_set"))
 
         return validated_eval_X, validated_eval_y
-
-    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
-        if self.baseline_score_column is not None:
-            if self.baseline_score_column not in X.columns:
-                raise ValidationError(bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column))
-            if X[self.baseline_score_column].isna().any():
-                raise ValidationError(bundle.get("baseline_score_column_has_na"))
-            if eval_set is not None:
-                if isinstance(eval_set, tuple):
-                    eval_set = [eval_set]
-                for eval in eval_set:
-                    if self.baseline_score_column not in eval[0].columns:
-                        raise ValidationError(bundle.get("baseline_score_column_not_exists"))
-                    if eval[0][self.baseline_score_column].isna().any():
-                        raise ValidationError(bundle.get("baseline_score_column_has_na"))
 
     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
         if self.baseline_score_column is not None:
@@ -2660,17 +2647,23 @@ class FeaturesEnricher(TransformerMixin):
         return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]
 
     @staticmethod
-    def __get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.EMAIL:
                 return col
 
     @staticmethod
-    def __get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.HEM:
                 return col
 
+    @staticmethod
+    def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+        for col, t in search_keys.items():
+            if t == SearchKey.PHONE:
+                return col
+
     def __add_fit_system_record_id(
         self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
     ) -> pd.DataFrame:
@@ -2785,9 +2778,9 @@ class FeaturesEnricher(TransformerMixin):
         result_features.index.name = original_index_name
 
         if rows_to_drop is not None:
-            print(f"Before dropping target outliers size: {len(result_features)}")
+            self.logger.info(f"Before dropping target outliers size: {len(result_features)}")
             result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
-            print(f"After dropping target outliers size: {len(result_features)}")
+            self.logger.info(f"After dropping target outliers size: {len(result_features)}")
 
         result_eval_sets = dict()
         if not is_transform and EVAL_SET_INDEX in result_features.columns:
@@ -2995,9 +2988,9 @@ class FeaturesEnricher(TransformerMixin):
                 self.logger.warning(f"Feature meta for display index {m.display_index} not found")
                 continue
             description["shap"] = feature_meta.shap_value
-            description["Sources"] = feature_meta.data_source\
-                .replace("AutoFE: features from ", "")\
-                .replace("AutoFE: feature from ", "")
+            description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "").replace(
+                "AutoFE: feature from ", ""
+            )
             description["Feature name"] = feature_meta.name
 
             feature_idx = 1
@@ -308,7 +308,6 @@ class _RestClient:
         # self.silent_mode = silent_mode
         self.client_ip = client_ip
         self.client_visitorid = client_visitorid
-        print(f"Created RestClient with {client_ip} and {client_visitorid}")
         self._access_token = self._refresh_access_token()
         # self._access_token: Optional[str] = None  # self._refresh_access_token()
         self.last_refresh_time = time.time()
@@ -442,9 +441,7 @@ class _RestClient:
     ) -> SearchTaskResponse:
         api_path = self.INITIAL_SEARCH_URI_FMT_V2
 
-        print(f"Start initial search with {self.client_ip} and {self.client_visitorid}")
         track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
-        print(f"Sending track metrics: {track_metrics}")
 
         def open_and_send():
             md5_hash = hashlib.md5()
@@ -486,7 +483,7 @@ class _RestClient:
                 api_path, files, trace_id=trace_id, additional_headers=additional_headers
             )
 
-        response = self._with_unauth_retry(lambda: open_and_send())
+        response = self._with_unauth_retry(open_and_send)
         return SearchTaskResponse(response)
 
     def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
@@ -571,7 +568,7 @@ class _RestClient:
                 api_path, files, trace_id=trace_id, additional_headers=additional_headers
             )
 
-        response = self._with_unauth_retry(lambda: open_and_send())
+        response = self._with_unauth_retry(open_and_send)
         return SearchTaskResponse(response)
 
     def validation_search_without_upload_v2(
@@ -912,8 +909,12 @@ def resolve_api_token(api_token: Optional[str]) -> str:
     return DEMO_API_KEY
 
 
-def get_rest_client(backend_url: Optional[str] = None, api_token: Optional[str] = None,
-                    client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
+def get_rest_client(
+    backend_url: Optional[str] = None,
+    api_token: Optional[str] = None,
+    client_ip: Optional[str] = None,
+    client_visitorid: Optional[str] = None,
+) -> _RestClient:
     url = _resolve_backend_url(backend_url)
     token = resolve_api_token(api_token)
 
@@ -925,15 +926,21 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
 
 
 @lru_cache()
-def _get_rest_client(backend_url: str, api_token: str,
-                     client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
+def _get_rest_client(
+    backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
+) -> _RestClient:
     return _RestClient(backend_url, api_token, client_ip, client_visitorid)
 
 
 class BackendLogHandler(logging.Handler):
-    def __init__(self, rest_client: _RestClient,
-                 client_ip: Optional[str] = None, client_visitorid: Optional[str] = None,
-                 *args, **kwargs) -> None:
+    def __init__(
+        self,
+        rest_client: _RestClient,
+        client_ip: Optional[str] = None,
+        client_visitorid: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
         super().__init__(*args, **kwargs)
         self.rest_client = rest_client
         self.track_metrics = None
@@ -987,8 +994,11 @@ class LoggerFactory:
             root.handlers.clear()
 
     def get_logger(
-        self, backend_url: Optional[str] = None, api_token: Optional[str] = None,
-        client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
+        self,
+        backend_url: Optional[str] = None,
+        api_token: Optional[str] = None,
+        client_ip: Optional[str] = None,
+        client_visitorid: Optional[str] = None,
     ) -> logging.Logger:
         url = _resolve_backend_url(backend_url)
         token = resolve_api_token(api_token)
@@ -3,8 +3,7 @@
 .. module: mdc
 .. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -32,9 +32,7 @@ def get_mdc_fields():
 
 @contextmanager
 def new_log_context(**kwargs):
-    context_id = "mdc-{thread}-{context}".format(
-        thread=threading.current_thread().ident, context=uuid.uuid4()
-    )
+    context_id = "mdc-{thread}-{context}".format(thread=threading.current_thread().ident, context=uuid.uuid4())
 
     LOGGER.debug("creating context %s", context_id)
 
@@ -48,11 +46,9 @@ def new_log_context(**kwargs):
         setattr(context, key, value)
 
     try:
-
         yield context
 
     finally:
-
         LOGGER.debug("deleting context %s", context_id)
 
         try:
@@ -7,7 +7,6 @@ from upgini.errors import ValidationError
 
 
 class PhoneNormalizer:
-
     def __init__(self, df: pd.DataFrame, phone_column_name: str, country_column_name: Optional[str] = None):
         self.df = df
         self.phone_column_name = phone_column_name
@@ -78,7 +77,7 @@ class PhoneNormalizer:
         try:
             value = str(value)
             if value.endswith(".0"):
-                value = value[:len(value) - 2]
+                value = value[: len(value) - 2]
             numeric_filter = filter(str.isdigit, value)
             numeric_string = "".join(numeric_filter)
             return PhoneNormalizer.validate_length(int(numeric_string))
@@ -337,5 +336,5 @@ class PhoneNormalizer:
         "PF": ("689", 7),
         "TK": ("690", 7),
         "FM": ("691", 7),
-        "MH": ("692", 7)
-    }
+        "MH": ("692", 7),
+    }
@@ -9,7 +9,6 @@ class MalformedResourceBundleError(ResourceBundleError):
 
 
 class NotInResourceBundleError(ResourceBundleError):
-
     def __init__(self, bundle_name: str, key: str):
         """
         Error that is raised when a key could not be found in a ResourceBundle.
@@ -144,6 +144,7 @@ dataset_empty_column_names=Some column names are empty. Add names please
 dataset_too_long_column_name=Column {} is too long: {} characters. Remove this column or trim length to 50 characters
 dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
+dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
 dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
 dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -47,9 +47,7 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
         Return the instance itself.
         """
         X, y, _ = self._check_X_y(X, y)
-        self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
-        )
+        self.sampling_strategy_ = check_sampling_strategy(self.sampling_strategy, y, self._sampling_type)
         return self
 
     def fit_resample(self, X, y):
@@ -77,15 +75,11 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
         arrays_transformer = ArraysTransformer(X, y)
         X, y, binarize_y = self._check_X_y(X, y)
 
-        self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
-        )
+        self.sampling_strategy_ = check_sampling_strategy(self.sampling_strategy, y, self._sampling_type)
 
         output = self._fit_resample(X, y)
 
-        y_ = (
-            label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]
-        )
+        y_ = label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]
 
         X_, y_ = arrays_transformer.transform(output[0], y_)
         return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
@@ -76,9 +76,7 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
     """
 
     @_deprecate_positional_args
-    def __init__(
-        self, *, sampling_strategy="auto", random_state=None, replacement=False
-    ):
+    def __init__(self, *, sampling_strategy="auto", random_state=None, replacement=False):
         super().__init__(sampling_strategy=sampling_strategy)
         self.random_state = random_state
         self.replacement = replacement
@@ -79,16 +79,12 @@ class SearchTask:
         with Spinner():
             if self.PROTECT_FROM_RATE_LIMIT:
                 time.sleep(1)  # this is necessary to avoid requests rate limit restrictions
-            self.summary = self.rest_client.search_task_summary_v2(
-                trace_id, search_task_id
-            )
+            self.summary = self.rest_client.search_task_summary_v2(trace_id, search_task_id)
             while self.summary.status not in completed_statuses and (
                 not check_fit or "VALIDATION" not in self.summary.status
             ):
                 time.sleep(self.POLLING_DELAY_SECONDS)
-                self.summary = self.rest_client.search_task_summary_v2(
-                    trace_id, search_task_id
-                )
+                self.summary = self.rest_client.search_task_summary_v2(trace_id, search_task_id)
             if self.summary.status in failed_statuses:
                 self.logger.error(f"Search {search_task_id} failed with status {self.summary.status}")
                 raise RuntimeError(bundle.get("search_task_failed_status"))
@@ -130,9 +126,7 @@ class SearchTask:
         for provider_summary in self.summary.initial_important_providers:
             if provider_summary.status == "COMPLETED":
                 self.provider_metadata_v2.append(
-                    self.rest_client.get_provider_search_metadata_v3(
-                        provider_summary.ads_search_task_id, trace_id
-                    )
+                    self.rest_client.get_provider_search_metadata_v3(provider_summary.ads_search_task_id, trace_id)
                 )
                 if provider_summary.unused_features_for_generation is not None:
                     self.unused_features_for_generation.extend(provider_summary.unused_features_for_generation)
@@ -271,7 +265,7 @@ class SearchTask:
             self.rest_client._refresh_token,
             trace_id,
             self.search_task_id,
-            self.PROTECT_FROM_RATE_LIMIT
+            self.PROTECT_FROM_RATE_LIMIT,
         )
 
     def get_max_initial_eval_set_hit_rate_v2(self) -> Optional[Dict[int, float]]:
@@ -4,13 +4,7 @@ import time
 
 
 class Spinner:
-
-    DEFAULT_FRAMES = [
-        "-",
-        "\\",
-        "|",
-        "/"
-    ]
+    DEFAULT_FRAMES = ["-", "\\", "|", "/"]
 
     def __init__(self, frames: List[str] = DEFAULT_FRAMES, step_time: float = 0.2):
         self.stop = False
@@ -22,7 +22,9 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
             return df
 
         df[country_column] = (
-            df[country_column].astype("string").str.upper()
+            df[country_column]
+            .astype("string")
+            .str.upper()
             .map(CountrySearchKeyDetector.COUNTRIES)
             .fillna(df[country_column])
         )
@@ -61,9 +61,22 @@ class DateTimeSearchKeyConverter:
         elif is_period_dtype(df[self.date_column]):
             df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
         elif is_numeric_dtype(df[self.date_column]):
-            msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
-            self.logger.warning(msg)
-            raise ValidationError(msg)
+            # 315532801 - 2524608001 - seconds
+            # 315532801000 - 2524608001000 - milliseconds
+            # 315532801000000 - 2524608001000000 - microseconds
+            # 315532801000000000 - 2524608001000000000 - nanoseconds
+            if df[self.date_column].apply(lambda x: 10**16 < x).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
+            elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
+            elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
+            elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
+            else:
+                msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
+                self.logger.warning(msg)
+                raise ValidationError(msg)
 
         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features
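The new branch above infers the epoch unit of a numeric date column from its order of magnitude; the commented bounds roughly span the years 1980-2050 in each unit. A standalone sketch of the same heuristic (just the arithmetic, not the library API), assuming all values fall into a single band:

import pandas as pd

# Plausibly seconds since the epoch: 2020-01-01 and 2021-01-01.
s = pd.Series([1577836800, 1609459200])

if (s > 10**16).all():
    converted = pd.to_datetime(s, unit="ns")
elif s.between(10**14, 10**16, inclusive="neither").all():
    converted = pd.to_datetime(s, unit="us")
elif s.between(10**11, 10**14, inclusive="neither").all():
    converted = pd.to_datetime(s, unit="ms")
elif s.between(0, 10**11, inclusive="neither").all():
    converted = pd.to_datetime(s, unit="s")
else:
    raise ValueError("mixed or out-of-range epoch values")

print(converted)  # both parsed as datetimes at second resolution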
@@ -0,0 +1,82 @@
+from logging import Logger
+from typing import Dict, List, Optional, Tuple, Union
+
+import pandas as pd
+
+from upgini.metadata import TARGET, ModelTaskType, SearchKey
+from upgini.resource_bundle import bundle
+from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.target_utils import define_task
+
+
+def remove_fintech_duplicates(
+    df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: Optional[Logger] = None, silent=False
+) -> Tuple[bool, pd.DataFrame]:
+    # Base checks
+    need_full_deduplication = True
+
+    if define_task(df[TARGET], silent=True) != ModelTaskType.BINARY:
+        return need_full_deduplication, df
+
+    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+    if date_col is None:
+        return need_full_deduplication, df
+
+    personal_cols = []
+    phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
+    if phone_col:
+        personal_cols.append(phone_col)
+    email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
+    if email_col:
+        personal_cols.append(email_col)
+    hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
+    if hem_col:
+        personal_cols.append(hem_col)
+    if len(personal_cols) == 0:
+        return need_full_deduplication, df
+
+    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+
+    uniques = grouped_by_personal_cols[date_col].nunique()
+    total = len(uniques)
+    diff_dates = len(uniques[uniques > 1])
+    if diff_dates / total >= 0.6:
+        return need_full_deduplication, df
+
+    # Additional checks
+
+    need_full_deduplication = False
+
+    duplicates = df.duplicated(personal_cols, keep=False)
+    duplicate_rows = df[duplicates]
+    if len(duplicate_rows) == 0:
+        return need_full_deduplication, df
+
+    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
+        return need_full_deduplication, df
+
+    def has_diff_target_within_60_days(rows):
+        rows = rows.sort_values(by=date_col)
+        return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
+
+    df = DateTimeSearchKeyConverter(date_col).convert(df)
+    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
+    if len(rows_with_diff_target) > 0:
+        perc = len(rows_with_diff_target) * 100 / len(df)
+        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
+            perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list()
+        )
+        if not silent:
+            print(msg)
+        if logger:
+            logger.warning(msg)
+        df = df[~df.index.isin(rows_with_diff_target.index)]
+
+    return need_full_deduplication, df
+
+
+def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
+    for col, key_type in search_keys.items():
+        if (isinstance(keys, list) and key_type in keys) or key_type == keys:
+            return col
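A hedged usage sketch of the new helper (assuming upgini >= 1.1.244a1 is installed; the data and column names are synthetic): rows that share personal keys but flip a binary target within roughly 60 days are expected to be dropped, and the returned flag tells the caller whether the generic full deduplication still has to run.

import pandas as pd

from upgini.metadata import SearchKey
from upgini.utils.deduplicate_utils import remove_fintech_duplicates

df = pd.DataFrame(
    {
        "phone": ["555-0101", "555-0101", "555-0199"],
        "date": ["2023-01-01", "2023-01-15", "2023-01-01"],
        "target": [0, 1, 0],  # same person, target flips within 60 days
    }
)
search_keys = {"phone": SearchKey.PHONE, "date": SearchKey.DATE}

need_full_dedup, cleaned = remove_fintech_duplicates(df, search_keys)
# Expected: need_full_dedup is False (the fintech-specific pass applied) and
# the flipping-target rows for "555-0101" are removed from `cleaned`.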
@@ -29,7 +29,6 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
 
 
 class EmailSearchKeyConverter:
-
     HEM_COLUMN_NAME = "hashed_email"
     DOMAIN_COLUMN_NAME = "email_domain"
     EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
@@ -2,8 +2,8 @@ from typing import Tuple
 
 
 class CustomFallbackProgressBar:
-    """Progressbar supports displaying a progressbar like element
-    """
+    """Progressbar supports displaying a progressbar like element"""
+
     def __init__(self, total=100):
         """Creates a new progressbar
 
@@ -20,12 +20,9 @@ class CustomFallbackProgressBar:
 
     def __repr__(self):
         fraction = self.progress / self.total
-        filled = '=' * int(fraction * self.text_width)
-        rest = ' ' * (self.text_width - len(filled))
-        return '[{}{}] {}% {} {}'.format(
-            filled, rest,
-            self.progress, self._stage, self._eta
-        )
+        filled = "=" * int(fraction * self.text_width)
+        rest = " " * (self.text_width - len(filled))
+        return "[{}{}] {}% {} {}".format(filled, rest, self.progress, self._stage, self._eta)
 
     def display(self):
         print(self)
@@ -1,5 +1,4 @@
 class WarningCounter:
-
     def __init__(self):
         self._count = 0
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.242a3
+Version: 1.1.244a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -49,6 +49,7 @@ src/upgini/utils/country_utils.py
 src/upgini/utils/custom_loss_utils.py
 src/upgini/utils/cv_utils.py
 src/upgini/utils/datetime_utils.py
+src/upgini/utils/deduplicate_utils.py
 src/upgini/utils/display_utils.py
 src/upgini/utils/email_utils.py
 src/upgini/utils/fallback_progress_bar.py
@@ -127,7 +127,8 @@ def test_multivariate_timeseries_detection():
 
 
 def test_multivariate_time_series():
-    df = pd.DataFrame({
+    df = pd.DataFrame(
+        {
             "date": [
                 "2020-01-01 00:00:00",
                 "2020-01-01 00:00:02",
@@ -135,44 +136,49 @@ def test_multivariate_time_series():
                 "2020-01-01 00:00:06",
                 "2020-01-01 00:00:08",
             ]
-    })
+        }
+    )
     assert not is_blocked_time_series(df, "date", ["date"])
 
-    df = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2020-02-01")
-    })
+    df = pd.DataFrame({"date": pd.date_range("2020-01-01", "2020-02-01")})
     assert not is_blocked_time_series(df, "date", ["date"])
 
-    df = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2021-01-01")
-    })
+    df = pd.DataFrame({"date": pd.date_range("2020-01-01", "2021-01-01")})
     assert is_blocked_time_series(df, "date", ["date"])
 
-    df1 = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2021-01-01"),
-        "feature1": np.random.randint(0, 1000, 367),
-        "feature2": np.random.randint(0, 1000, 367)
-    })
-    df2 = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2021-01-01"),
-        "feature1": np.random.randint(0, 1000, 367),
-        "feature2": np.random.randint(0, 1000, 367)
-    })
+    df1 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+        }
+    )
+    df2 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+        }
+    )
     df = pd.concat([df1, df2])
     assert is_blocked_time_series(df, "date", ["date"])
 
-    df1 = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2021-01-01"),
-        "feature1": np.random.randint(0, 1000, 367),
-        "feature2": np.random.randint(0, 1000, 367),
-        "feature3": np.random.randint(0, 1000, 367),
-    })
-    df2 = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2021-01-01"),
-        "feature1": np.random.randint(0, 1000, 367),
-        "feature2": np.random.randint(0, 1000, 367),
-        "feature3": np.random.randint(0, 1000, 367),
-    })
+    df1 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+            "feature3": np.random.randint(0, 1000, 367),
+        }
+    )
+    df2 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+            "feature3": np.random.randint(0, 1000, 367),
+        }
+    )
     df = pd.concat([df1, df2])
     assert not is_blocked_time_series(df, "date", ["date"])
 
@@ -58,7 +58,7 @@ def test_convertion_to_hem():
                 None,
                 None,
                 None,
-                None
+                None,
             ],
             EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME: ["tgoogle.com", None, None, None, None, None],
             EmailSearchKeyConverter.DOMAIN_COLUMN_NAME: ["google.com", None, None, None, None, None],
@@ -58,9 +58,14 @@ def test_string_ip_to_int_conversion():
             {"ip": None},
         ]
     )
-    dataset = Dataset("test", df=df, search_keys=[("ip", )], meaning_types={
-        "ip": FileColumnMeaningType.IP_ADDRESS,
-    })
+    dataset = Dataset(
+        "test",
+        df=df,
+        search_keys=[("ip",)],
+        meaning_types={
+            "ip": FileColumnMeaningType.IP_ADDRESS,
+        },
+    )
     dataset._Dataset__rename_columns()
     dataset._Dataset__convert_ip()
     assert dataset.data["ip_bb9af5_v4"].dtype == "Int64"
@@ -77,7 +82,7 @@ def test_python_ip_to_int_conversion():
             {"ip": ipaddress.ip_address("192.168.1.1")},
         ]
     )
-    dataset = Dataset("test", df=df, search_keys=[("ip", )])
+    dataset = Dataset("test", df=df, search_keys=[("ip",)])
     dataset.meaning_types = {
         "ip": FileColumnMeaningType.IP_ADDRESS,
     }
@@ -91,7 +96,7 @@ def test_python_ip_to_int_conversion():
 
 def test_ip_v6_conversion():
     df = pd.DataFrame({"ip": ["::cf:befe:525b"]})
-    dataset = Dataset("test", df=df, search_keys=[("ip", )])
+    dataset = Dataset("test", df=df, search_keys=[("ip",)])
     dataset.meaning_types = {
         "ip": FileColumnMeaningType.IP_ADDRESS,
     }
@@ -107,7 +112,7 @@ def test_int_ip_to_int_conversion():
     df = pd.DataFrame(
         {"ip": [3232235777, 892262568539]},
     )
-    dataset = Dataset("test", df=df, search_keys=[("ip", )])  # type: ignore
+    dataset = Dataset("test", df=df, search_keys=[("ip",)])  # type: ignore
     dataset.meaning_types = {
         "ip": FileColumnMeaningType.IP_ADDRESS,
     }
@@ -615,9 +620,7 @@ def test_columns_renaming():
 
     df = pd.concat([df1, df2], axis=1)
 
-    dataset = Dataset(
-        "tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)]
-    )
+    dataset = Dataset("tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)])
     dataset._Dataset__rename_columns()
     print(dataset)
     assert set(dataset.data.columns.to_list()) == {"feature1_422b73", "date_0e8763", "feature1_422b73_0"}
@@ -632,9 +635,7 @@ def test_too_long_columns():
         }
     )
 
-    dataset = Dataset(
-        "tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)]
-    )
+    dataset = Dataset("tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)])
     dataset._Dataset__rename_columns()
     print(dataset)
     assert set(dataset.data.columns.to_list()) == {
@@ -417,7 +417,7 @@ def test_widget(requests_mock: Mocker):
             '<button kind="secondary"><p>Instant purchase</p></button></a></div>'
         ),
         (
-            "<div class=\"stButton\"><a href='https://app.snowflake.com/marketplace/listing/GZSTZ3VDMF6/"
+            '<div class="stButton"><a href=\'https://app.snowflake.com/marketplace/listing/GZSTZ3VDMF6/'
             "?referer=upgini' target='_blank' rel='noopener noreferrer'><button kind=\"secondary\"><p>"
             "Instant purchase</p></button></a></div>"
         ),
@@ -5,12 +5,12 @@ import pandas as pd
 from requests import get
 
 from upgini.metadata import SearchKey
+
 # from upgini.resource_bundle import bundle
 # from upgini.utils.track_info import get_track_metrics
 
 
 class IpToCountrySearchKeyConverter:
-
     url = "http://ip-api.com/json/{}"
 
     def __init__(