upgini 1.1.236a2__tar.gz → 1.1.237a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. {upgini-1.1.236a2/src/upgini.egg-info → upgini-1.1.237a2}/PKG-INFO +1 -1
  2. {upgini-1.1.236a2 → upgini-1.1.237a2}/setup.py +1 -1
  3. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/dataset.py +10 -1
  4. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/features_enricher.py +17 -8
  5. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/resource_bundle/strings.properties +1 -0
  6. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/datetime_utils.py +16 -3
  7. upgini-1.1.237a2/src/upgini/utils/deduplicate_utils.py +72 -0
  8. {upgini-1.1.236a2 → upgini-1.1.237a2/src/upgini.egg-info}/PKG-INFO +1 -1
  9. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini.egg-info/SOURCES.txt +1 -0
  10. {upgini-1.1.236a2 → upgini-1.1.237a2}/LICENSE +0 -0
  11. {upgini-1.1.236a2 → upgini-1.1.237a2}/README.md +0 -0
  12. {upgini-1.1.236a2 → upgini-1.1.237a2}/pyproject.toml +0 -0
  13. {upgini-1.1.236a2 → upgini-1.1.237a2}/setup.cfg +0 -0
  14. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/__init__.py +0 -0
  15. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/ads.py +0 -0
  16. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/ads_management/__init__.py +0 -0
  17. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/ads_management/ads_manager.py +0 -0
  18. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/__init__.py +0 -0
  19. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/all_operands.py +0 -0
  20. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/binary.py +0 -0
  21. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/errors.py +0 -0
  29. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/fingerprint.js +0 -0
  30. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/http.py +0 -0
  31. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/metadata.py +0 -0
  34. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/metrics.py +0 -0
  35. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/normalizer/phone_normalizer.py +0 -0
  37. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/search_task.py +0 -0
  44. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/spinner.py +0 -0
  45. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/display_utils.py +0 -0
  52. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/email_utils.py +0 -0
  53. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  54. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/features_validator.py +0 -0
  55. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/format.py +0 -0
  56. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/ip_utils.py +0 -0
  57. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/phone_utils.py +0 -0
  58. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/postal_code_utils.py +0 -0
  59. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/progress_bar.py +0 -0
  60. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/sklearn_ext.py +0 -0
  61. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/target_utils.py +0 -0
  62. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/track_info.py +0 -0
  63. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/utils/warning_counter.py +0 -0
  64. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini/version_validator.py +0 -0
  65. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini.egg-info/dependency_links.txt +0 -0
  66. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini.egg-info/requires.txt +0 -0
  67. {upgini-1.1.236a2 → upgini-1.1.237a2}/src/upgini.egg-info/top_level.txt +0 -0
  68. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_binary_dataset.py +0 -0
  69. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_blocked_time_series.py +0 -0
  70. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_categorical_dataset.py +0 -0
  71. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_continuous_dataset.py +0 -0
  72. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_country_utils.py +0 -0
  73. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_custom_loss_utils.py +0 -0
  74. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_datetime_utils.py +0 -0
  75. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_email_utils.py +0 -0
  76. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_etalon_validation.py +0 -0
  77. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_features_enricher.py +0 -0
  78. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_metrics.py +0 -0
  79. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_phone_utils.py +0 -0
  80. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_postal_code_utils.py +0 -0
  81. {upgini-1.1.236a2 → upgini-1.1.237a2}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.236a2
3
+ Version: 1.1.237a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.236a2"
43
+ version = "1.1.237a2"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -36,12 +36,14 @@ from upgini.metadata import (
36
36
  NumericInterval,
37
37
  RuntimeParameters,
38
38
  SearchCustomization,
39
+ SearchKey,
39
40
  )
40
41
  from upgini.normalizer.phone_normalizer import PhoneNormalizer
41
42
  from upgini.resource_bundle import bundle
42
43
  from upgini.sampler.random_under_sampler import RandomUnderSampler
43
44
  from upgini.search_task import SearchTask
44
45
  from upgini.utils import combine_search_keys
46
+ from upgini.utils.deduplicate_utils import remove_fintech_duplicates
45
47
  from upgini.utils.email_utils import EmailSearchKeyConverter
46
48
 
47
49
  try:
@@ -382,7 +384,7 @@ class Dataset: # (pd.DataFrame):
382
384
 
383
385
  if is_string_dtype(self.data[postal_code]):
384
386
  try:
385
- self.data[postal_code] = self.data[postal_code].astype("Float64").astype("Int64").astype("string")
387
+ self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
386
388
  except Exception:
387
389
  pass
388
390
  elif is_float_dtype(self.data[postal_code]):
@@ -820,6 +822,13 @@ class Dataset: # (pd.DataFrame):
820
822
  self.__validate_dataset(validate_target, silent_mode)
821
823
 
822
824
  if validate_target:
825
+ search_keys = {
826
+ col: SearchKey.from_meaning_type(key_type)
827
+ for col, key_type in self.meaning_types.items()
828
+ if SearchKey.from_meaning_type(key_type) is not None
829
+ }
830
+ self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
831
+
823
832
  self.__validate_target()
824
833
 
825
834
  self.__resample()
@@ -64,6 +64,7 @@ from upgini.utils.datetime_utils import (
64
64
  is_blocked_time_series,
65
65
  is_time_series,
66
66
  )
67
+ from upgini.utils.deduplicate_utils import remove_fintech_duplicates
67
68
  from upgini.utils.display_utils import (
68
69
  display_html_dataframe,
69
70
  do_without_pandas_limits,
@@ -1183,8 +1184,8 @@ class FeaturesEnricher(TransformerMixin):
1183
1184
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
1184
1185
  extended_X = converter.convert(extended_X, keep_time=True)
1185
1186
  generated_features.extend(converter.generated_features)
1186
- email_column = self.__get_email_column(search_keys)
1187
- hem_column = self.__get_hem_column(search_keys)
1187
+ email_column = self._get_email_column(search_keys)
1188
+ hem_column = self._get_hem_column(search_keys)
1188
1189
  if email_column:
1189
1190
  converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1190
1191
  extended_X = converter.convert(extended_X)
@@ -1505,6 +1506,8 @@ class FeaturesEnricher(TransformerMixin):
1505
1506
  eval_df_with_index[TARGET] = eval_y
1506
1507
  eval_df_with_index[EVAL_SET_INDEX] = idx + 1
1507
1508
  df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
1509
+
1510
+ df_with_eval_set_index = remove_fintech_duplicates(df_with_eval_set_index, self.search_keys, self.logger)
1508
1511
 
1509
1512
  # downsample if need to eval_set threshold
1510
1513
  num_samples = _num_samples(df_with_eval_set_index)
@@ -1741,8 +1744,8 @@ class FeaturesEnricher(TransformerMixin):
1741
1744
  generated_features.extend(converter.generated_features)
1742
1745
  else:
1743
1746
  self.logger.info("Input dataset hasn't date column")
1744
- email_column = self.__get_email_column(search_keys)
1745
- hem_column = self.__get_hem_column(search_keys)
1747
+ email_column = self._get_email_column(search_keys)
1748
+ hem_column = self._get_hem_column(search_keys)
1746
1749
  email_converted_to_hem = False
1747
1750
  if email_column:
1748
1751
  converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
@@ -2081,8 +2084,8 @@ class FeaturesEnricher(TransformerMixin):
2081
2084
  self.fit_generated_features.extend(converter.generated_features)
2082
2085
  else:
2083
2086
  self.logger.info("Input dataset hasn't date column")
2084
- email_column = self.__get_email_column(self.fit_search_keys)
2085
- hem_column = self.__get_hem_column(self.fit_search_keys)
2087
+ email_column = self._get_email_column(self.fit_search_keys)
2088
+ hem_column = self._get_hem_column(self.fit_search_keys)
2086
2089
  email_converted_to_hem = False
2087
2090
  if email_column:
2088
2091
  converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
@@ -2615,16 +2618,22 @@ class FeaturesEnricher(TransformerMixin):
2615
2618
  return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]
2616
2619
 
2617
2620
  @staticmethod
2618
- def __get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2621
+ def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2619
2622
  for col, t in search_keys.items():
2620
2623
  if t == SearchKey.EMAIL:
2621
2624
  return col
2622
2625
 
2623
2626
  @staticmethod
2624
- def __get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2627
+ def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2625
2628
  for col, t in search_keys.items():
2626
2629
  if t == SearchKey.HEM:
2627
2630
  return col
2631
+
2632
+ @staticmethod
2633
+ def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2634
+ for col, t in search_keys.items():
2635
+ if t == SearchKey.PHONE:
2636
+ return col
2628
2637
 
2629
2638
  def __add_fit_system_record_id(
2630
2639
  self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
@@ -142,6 +142,7 @@ dataset_empty_column_names=Some column names are empty. Add names please
142
142
  dataset_too_long_column_name=Column {} is too long: {} characters. Remove this column or trim length to 50 characters
143
143
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
144
144
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
145
+ dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
145
146
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
146
147
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
147
148
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -61,9 +61,22 @@ class DateTimeSearchKeyConverter:
61
61
  elif is_period_dtype(df[self.date_column]):
62
62
  df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
63
63
  elif is_numeric_dtype(df[self.date_column]):
64
- msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
65
- self.logger.warning(msg)
66
- raise ValidationError(msg)
64
+ # 315532801 - 2524608001 - seconds
65
+ # 315532801000 - 2524608001000 - milliseconds
66
+ # 315532801000000 - 2524608001000000 - microseconds
67
+ # 315532801000000000 - 2524608001000000000 - nanoseconds
68
+ if df[self.date_column].apply(lambda x: 10**16 < x).all():
69
+ df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
70
+ elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
71
+ df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
72
+ elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
73
+ df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
74
+ elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
75
+ df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
76
+ else:
77
+ msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
78
+ self.logger.warning(msg)
79
+ raise ValidationError(msg)
67
80
 
68
81
  # If column with date is datetime then extract seconds of the day and minute of the hour
69
82
  # as additional features
@@ -0,0 +1,72 @@
1
+ from logging import Logger
2
+ from typing import Dict, List, Optional, Union
3
+
4
+ import pandas as pd
5
+
6
+ from upgini.metadata import TARGET, ModelTaskType, SearchKey
7
+ from upgini.resource_bundle import bundle
8
+ from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
+ from upgini.utils.target_utils import define_task
10
+
11
+
12
def remove_fintech_duplicates(df: pd.DataFrame,
                              search_keys: Dict[str, SearchKey],
                              logger: Optional[Logger] = None) -> pd.DataFrame:
    """Drop rows that look like fintech-style duplicates with conflicting labels.

    A "fintech duplicate" is a group of rows sharing the same personal search keys
    (phone / email / hashed email) whose target values disagree within a short time
    window. Such rows are removed because they make the label ambiguous for search.

    The function is a no-op (returns ``df`` unchanged) unless ALL of these hold:
      * the task inferred from the target is binary classification,
      * a DATE or DATETIME search key is present,
      * at least one personal search key (PHONE / EMAIL / HEM) is present,
      * there is at least one duplicated personal-key combination,
      * fewer than 60% of the personal-key groups span multiple dates
        (otherwise the data looks like a legitimate time series per person),
      * at least one group actually has conflicting target values.

    Parameters
    ----------
    df : input dataset; must contain a ``target`` column (accessed as ``df.target``,
        so the TARGET constant is assumed to equal the literal column name).
    search_keys : mapping of column name -> SearchKey meaning.
    logger : optional logger; removal warnings are duplicated to it.

    Returns
    -------
    pd.DataFrame — either the original frame or a copy with the conflicting
    duplicate rows removed. NOTE(review): when duplicates are processed the date
    column is converted in place by DateTimeSearchKeyConverter, so the returned
    frame's date column may have a different dtype than the input — confirm callers
    expect that.
    """
    # Only binary targets are considered: for regression/multiclass, repeated
    # personal keys with different y values are not necessarily errors.
    if define_task(df.target, silent=True) != ModelTaskType.BINARY:
        return df

    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
    if date_col is None:
        return df

    # Collect all "personal" identity columns present among the search keys.
    personal_cols = []
    phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
    if phone_col:
        personal_cols.append(phone_col)
    email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
    if email_col:
        personal_cols.append(email_col)
    hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
    if hem_col:
        personal_cols.append(hem_col)
    if len(personal_cols) == 0:
        return df

    # keep=False marks every member of a duplicated group, not just the extras.
    duplicates = df.duplicated(personal_cols, keep=False)
    duplicate_rows = df[duplicates]
    if len(duplicate_rows) == 0:
        return df

    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)

    # If most (>= 60%) personal-key groups span several distinct dates, the data
    # is treated as per-person history rather than accidental duplication.
    uniques = grouped_by_personal_cols[date_col].nunique()
    total = len(uniques)
    diff_dates = len(uniques[uniques > 1])
    if diff_dates / total >= 0.6:
        return df

    # All groups agree on the target -> nothing ambiguous to remove.
    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
        return df

    def has_diff_target_within_60_days(rows):
        # True if, after sorting by date, some consecutive pair of rows for the
        # same person flips the target within 60 days.
        # NOTE(review): the constant 60*24*60*60*1000 is 60 days in MILLISECONDS,
        # so this assumes DateTimeSearchKeyConverter leaves the date column as an
        # epoch-milliseconds numeric (a datetime64 column would make .diff()
        # return Timedelta and this comparison would raise) — TODO confirm.
        rows = rows.sort_values(by=date_col)
        return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0

    # Normalize the date column, then re-group on the converted frame.
    df = DateTimeSearchKeyConverter(date_col).convert(df)
    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
    # groupby.filter keeps every row of each group for which the predicate is True,
    # i.e. all rows of "conflicting" groups are selected for removal.
    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
    if len(rows_with_diff_target) > 0:
        perc = len(rows_with_diff_target) * 100 / len(df)
        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list())
        print(msg)
        if logger:
            logger.warning(msg)
        df = df[~df.index.isin(rows_with_diff_target.index)]

    return df
67
+
68
+
69
+ def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
70
+ for col, key_type in search_keys.items():
71
+ if (isinstance(keys, list) and key_type in keys) or key_type == keys:
72
+ return col
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.236a2
3
+ Version: 1.1.237a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -49,6 +49,7 @@ src/upgini/utils/country_utils.py
49
49
  src/upgini/utils/custom_loss_utils.py
50
50
  src/upgini/utils/cv_utils.py
51
51
  src/upgini/utils/datetime_utils.py
52
+ src/upgini/utils/deduplicate_utils.py
52
53
  src/upgini/utils/display_utils.py
53
54
  src/upgini/utils/email_utils.py
54
55
  src/upgini/utils/fallback_progress_bar.py
File without changes
File without changes
File without changes
File without changes
File without changes