upgini 1.1.264__tar.gz → 1.1.264a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (85)
  1. {upgini-1.1.264/src/upgini.egg-info → upgini-1.1.264a1}/PKG-INFO +1 -1
  2. {upgini-1.1.264 → upgini-1.1.264a1}/setup.py +1 -1
  3. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/features_enricher.py +22 -13
  4. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/strings.properties +2 -2
  5. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/datetime_utils.py +1 -49
  6. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/deduplicate_utils.py +61 -18
  7. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/target_utils.py +6 -2
  8. {upgini-1.1.264 → upgini-1.1.264a1/src/upgini.egg-info}/PKG-INFO +1 -1
  9. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_datetime_utils.py +2 -30
  10. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_features_enricher.py +0 -2
  11. {upgini-1.1.264 → upgini-1.1.264a1}/LICENSE +0 -0
  12. {upgini-1.1.264 → upgini-1.1.264a1}/README.md +0 -0
  13. {upgini-1.1.264 → upgini-1.1.264a1}/pyproject.toml +0 -0
  14. {upgini-1.1.264 → upgini-1.1.264a1}/setup.cfg +0 -0
  15. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/__init__.py +0 -0
  16. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/ads.py +0 -0
  17. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/all_operands.py +0 -0
  21. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/date.py +0 -0
  23. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/feature.py +0 -0
  24. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/operand.py +0 -0
  26. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/unary.py +0 -0
  27. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/vector.py +0 -0
  28. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/data_source/__init__.py +0 -0
  29. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  30. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/dataset.py +0 -0
  31. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/errors.py +0 -0
  32. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/fingerprint.js +0 -0
  33. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/http.py +0 -0
  34. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/mdc/__init__.py +0 -0
  35. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/mdc/context.py +0 -0
  36. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/metadata.py +0 -0
  37. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/metrics.py +0 -0
  38. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/normalizer/__init__.py +0 -0
  39. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
  40. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/__init__.py +0 -0
  41. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  42. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/search_task.py +0 -0
  48. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/spinner.py +0 -0
  49. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/__init__.py +0 -0
  50. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  51. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/blocked_time_series.py +0 -0
  52. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/country_utils.py +0 -0
  53. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  54. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/cv_utils.py +0 -0
  55. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/ip_utils.py +0 -0
  61. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/phone_utils.py +0 -0
  62. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/postal_code_utils.py +0 -0
  63. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/progress_bar.py +0 -0
  64. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/sklearn_ext.py +0 -0
  65. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/version_validator.py +0 -0
  68. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/SOURCES.txt +0 -0
  69. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/dependency_links.txt +0 -0
  70. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/requires.txt +0 -0
  71. {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/top_level.txt +0 -0
  72. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_autofe_operands.py +0 -0
  73. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_binary_dataset.py +0 -0
  74. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_blocked_time_series.py +0 -0
  75. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_categorical_dataset.py +0 -0
  76. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_continuous_dataset.py +0 -0
  77. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_country_utils.py +0 -0
  78. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_custom_loss_utils.py +0 -0
  79. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_email_utils.py +0 -0
  80. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_etalon_validation.py +0 -0
  81. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.264
3
+ Version: 1.1.264a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.264"
43
+ version = "1.1.264a1"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -70,7 +70,6 @@ from upgini.utils.datetime_utils import (
70
70
  DateTimeSearchKeyConverter,
71
71
  is_blocked_time_series,
72
72
  is_time_series,
73
- validate_dates_distribution,
74
73
  )
75
74
  from upgini.utils.deduplicate_utils import (
76
75
  clean_full_duplicates,
@@ -1686,6 +1685,9 @@ class FeaturesEnricher(TransformerMixin):
1686
1685
  df = validated_X.copy()
1687
1686
 
1688
1687
  df[TARGET] = validated_y
1688
+
1689
+ df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1690
+
1689
1691
  num_samples = _num_samples(df)
1690
1692
  if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1691
1693
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
@@ -1920,6 +1922,7 @@ class FeaturesEnricher(TransformerMixin):
1920
1922
 
1921
1923
  meaning_types = {col: key.value for col, key in search_keys.items()}
1922
1924
  non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1925
+ # Don't pass
1923
1926
  if email_converted_to_hem:
1924
1927
  non_keys_columns.append(email_column)
1925
1928
 
@@ -1941,6 +1944,7 @@ class FeaturesEnricher(TransformerMixin):
1941
1944
  if add_fit_system_record_id:
1942
1945
  df = self.__add_fit_system_record_id(df, dict(), search_keys)
1943
1946
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1947
+ non_keys_columns.append(SORT_ID)
1944
1948
 
1945
1949
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1946
1950
 
@@ -2217,10 +2221,6 @@ class FeaturesEnricher(TransformerMixin):
2217
2221
  self.fit_search_keys = self.search_keys.copy()
2218
2222
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2219
2223
 
2220
- validate_dates_distribution(
2221
- validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2222
- )
2223
-
2224
2224
  has_date = self._get_date_column(self.fit_search_keys) is not None
2225
2225
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2226
2226
  self._validate_binary_observations(validated_y, model_task_type)
@@ -2883,26 +2883,35 @@ class FeaturesEnricher(TransformerMixin):
2883
2883
 
2884
2884
  # order by date and idempotent order by other keys
2885
2885
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
2886
+ sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
2886
2887
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2887
2888
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2889
+ sort_exclude_columns.append(self._get_date_column(search_keys))
2888
2890
  else:
2889
2891
  date_column = self._get_date_column(search_keys)
2890
2892
  sort_columns = [date_column] if date_column is not None else []
2891
2893
 
2892
- other_search_keys = sorted(
2894
+ other_columns = sorted(
2893
2895
  [
2894
- sk
2895
- for sk, key_type in search_keys.items()
2896
- if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2897
- and sk in df.columns
2898
- and df[sk].nunique() > 1 # don't use constant keys for hash
2896
+ c
2897
+ for c in df.columns
2898
+ if c not in sort_columns
2899
+ and c not in sort_exclude_columns
2900
+ and df[c].nunique() > 1
2899
2901
  ]
2902
+ # [
2903
+ # sk
2904
+ # for sk, key_type in search_keys.items()
2905
+ # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2906
+ # and sk in df.columns
2907
+ # and df[sk].nunique() > 1 # don't use constant keys for hash
2908
+ # ]
2900
2909
  )
2901
2910
 
2902
2911
  search_keys_hash = "search_keys_hash"
2903
- if len(other_search_keys) > 0:
2912
+ if len(other_columns) > 0:
2904
2913
  sort_columns.append(search_keys_hash)
2905
- df[search_keys_hash] = pd.util.hash_pandas_object(df[sorted(other_search_keys)], index=False)
2914
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
2906
2915
 
2907
2916
  df = df.sort_values(by=sort_columns)
2908
2917
 
@@ -111,7 +111,6 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
114
- x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
115
114
  # eval set validation
116
115
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
117
116
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -146,7 +145,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
146
145
  dataset_empty_column_names=Some column names are empty. Add names please
147
146
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
148
147
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
149
- dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
148
+ dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
149
+ dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
150
150
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
151
151
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
152
152
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- from typing import Dict, List, Optional
4
+ from typing import List, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
@@ -9,9 +9,7 @@ from dateutil.relativedelta import relativedelta
9
9
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
- from upgini.metadata import SearchKey
13
12
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
- from upgini.utils.warning_counter import WarningCounter
15
13
 
16
14
  DATE_FORMATS = [
17
15
  "%Y-%m-%d",
@@ -227,49 +225,3 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
227
225
 
228
226
  is_diff_less_than_two_columns = grouped.apply(check_differences)
229
227
  return is_diff_less_than_two_columns.all()
230
-
231
-
232
- def validate_dates_distribution(
233
- X: pd.DataFrame,
234
- search_keys: Dict[str, SearchKey],
235
- logger: Optional[logging.Logger] = None,
236
- bundle: Optional[ResourceBundle] = None,
237
- warning_counter: Optional[WarningCounter] = None,
238
- ):
239
- maybe_date_col = None
240
- for key, key_type in search_keys.items():
241
- if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
242
- maybe_date_col = key
243
-
244
- if maybe_date_col is None:
245
- for col in X.columns:
246
- if col in search_keys:
247
- continue
248
- try:
249
- pd.to_datetime(X[col])
250
- maybe_date_col = col
251
- break
252
- except Exception:
253
- pass
254
-
255
- if maybe_date_col is None:
256
- return
257
-
258
- dates = pd.to_datetime(X[maybe_date_col]).dt.date
259
-
260
- date_counts = dates.value_counts().sort_index()
261
-
262
- date_counts_1 = date_counts[: round(len(date_counts) / 2)]
263
- date_counts_2 = date_counts[round(len(date_counts) / 2) :]
264
- ratio = date_counts_2.mean() / date_counts_1.mean()
265
-
266
- if ratio > 1.2 or ratio < 0.8:
267
- if warning_counter is not None:
268
- warning_counter.increment()
269
- if logger is None:
270
- logger = logging.getLogger("muted_logger")
271
- logger.setLevel("FATAL")
272
- bundle = bundle or get_custom_bundle()
273
- msg = bundle.get("x_unstable_by_date")
274
- print(msg)
275
- logger.warning(msg)
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
7
7
  from upgini.resource_bundle import ResourceBundle
8
8
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
9
  from upgini.utils.target_utils import define_task
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
78
78
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
79
79
  if len(rows_with_diff_target) > 0:
80
80
  unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
81
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
82
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
83
- perc = len(rows_to_remove) * 100 / len(df)
84
- msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
85
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
86
- )
87
- if not silent:
88
- print(msg)
89
- if logger:
90
- logger.warning(msg)
91
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
92
- df = df[~df.index.isin(rows_to_remove.index)]
93
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
94
-
81
+ if EVAL_SET_INDEX not in df.columns:
82
+ rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
83
+ rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
84
+ perc = len(rows_to_remove) * 100 / len(df)
85
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
86
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
87
+ )
88
+ if not silent:
89
+ print(msg)
90
+ if logger:
91
+ logger.warning(msg)
92
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
93
+ df = df[~df.index.isin(rows_to_remove.index)]
94
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
+ else:
96
+ # Indices in train and eval_set can be the same so we remove rows from them separately
97
+ train = df.query(f"{EVAL_SET_INDEX} == 0")
98
+ train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
99
+ train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
100
+ train_perc = len(train_rows_to_remove) * 100 / len(train)
101
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
102
+ train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
103
+ )
104
+ if not silent:
105
+ print(msg)
106
+ if logger:
107
+ logger.warning(msg)
108
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
109
+ train = train[~train.index.isin(train_rows_to_remove.index)]
110
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
111
+
112
+ evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
113
+ new_evals = []
114
+ for i, eval in enumerate(evals):
115
+ eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
116
+ eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
117
+ eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
118
+ msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
119
+ eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
120
+ )
121
+ if not silent:
122
+ print(msg)
123
+ if logger:
124
+ logger.warning(msg)
125
+ logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
126
+ eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
127
+ logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
128
+ new_evals.append(eval)
129
+
130
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
131
+ df = pd.concat([train] + new_evals)
132
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
133
  return df
96
134
 
97
135
 
@@ -101,14 +139,18 @@ def clean_full_duplicates(
101
139
  nrows = len(df)
102
140
  if nrows == 0:
103
141
  return df
104
- # Remove absolute duplicates (exclude system_record_id)
142
+ # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
105
143
  unique_columns = df.columns.tolist()
106
144
  if SYSTEM_RECORD_ID in unique_columns:
107
145
  unique_columns.remove(SYSTEM_RECORD_ID)
108
146
  if SORT_ID in unique_columns:
109
147
  unique_columns.remove(SORT_ID)
148
+ if EVAL_SET_INDEX in unique_columns:
149
+ unique_columns.remove(EVAL_SET_INDEX)
110
150
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
111
- df = df.drop_duplicates(subset=unique_columns)
151
+ # Train segment goes first so if duplicates are found in train and eval set
152
+ # then we keep unique rows in train segment
153
+ df = df.drop_duplicates(subset=unique_columns, keep="first")
112
154
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
113
155
  nrows_after_full_dedup = len(df)
114
156
  share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
@@ -123,7 +165,7 @@ def clean_full_duplicates(
123
165
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
124
166
  if marked_duplicates.sum() > 0:
125
167
  dups_indices = df[marked_duplicates].index.to_list()
126
- nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
168
+ nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
127
169
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
128
170
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
129
171
 
@@ -133,6 +175,7 @@ def clean_full_duplicates(
133
175
  print(msg)
134
176
  df = df.drop_duplicates(subset=unique_columns, keep=False)
135
177
  logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
178
+
136
179
  return df
137
180
 
138
181
 
@@ -132,7 +132,9 @@ def balance_undersample(
132
132
  class_value = classes[class_idx]
133
133
  class_count = vc[class_value]
134
134
  sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
135
- sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
135
+ sampler = RandomUnderSampler(
136
+ sampling_strategy=sample_strategy, random_state=random_state
137
+ )
136
138
  X = df[SYSTEM_RECORD_ID]
137
139
  X = X.to_frame(SYSTEM_RECORD_ID)
138
140
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
@@ -151,7 +153,9 @@ def balance_undersample(
151
153
  minority_class = df[df[target_column] == min_class_value]
152
154
  majority_class = df[df[target_column] != min_class_value]
153
155
  sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
154
- sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
156
+ sampled_majority_class = majority_class.sample(
157
+ n=sample_size, random_state=random_state
158
+ )
155
159
  resampled_data = df[
156
160
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
157
161
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.264
3
+ Version: 1.1.264a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -1,13 +1,7 @@
1
- import numpy as np
2
1
  import pandas as pd
2
+ import numpy as np
3
3
 
4
- from upgini.metadata import SearchKey
5
- from upgini.utils.datetime_utils import (
6
- is_blocked_time_series,
7
- is_time_series,
8
- validate_dates_distribution,
9
- )
10
- from upgini.utils.warning_counter import WarningCounter
4
+ from upgini.utils.datetime_utils import is_blocked_time_series, is_time_series
11
5
 
12
6
  pd.set_option("mode.chained_assignment", "raise")
13
7
 
@@ -189,25 +183,3 @@ def test_multivariate_time_series():
189
183
  assert not is_blocked_time_series(df, "date", ["date"])
190
184
 
191
185
  assert is_blocked_time_series(df, "date", ["date", "feature3"])
192
-
193
-
194
- def test_validate_dates_distribution():
195
- df = pd.DataFrame({"date": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
196
- warning_counter = WarningCounter()
197
- validate_dates_distribution(df, {}, warning_counter=warning_counter)
198
- assert warning_counter.has_warnings()
199
-
200
- df = pd.DataFrame({"date": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
201
- warning_counter = WarningCounter()
202
- validate_dates_distribution(df, {}, warning_counter=warning_counter)
203
- assert not warning_counter.has_warnings()
204
-
205
- df = pd.DataFrame(
206
- {
207
- "date2": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
208
- "date1": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
209
- }
210
- )
211
- warning_counter = WarningCounter()
212
- validate_dates_distribution(df, {"date1": SearchKey.DATE}, warning_counter=warning_counter)
213
- assert warning_counter.has_warnings()
@@ -2164,8 +2164,6 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
2164
2164
 
2165
2165
  actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
2166
2166
  # actual_result_df.to_parquet(expected_result_path)
2167
- actual_result_df["phone_num_a54a33"] = actual_result_df["phone_num_a54a33"].astype("Int64")
2168
- actual_result_df["rep_date_f5d6bb"] = actual_result_df["rep_date_f5d6bb"].astype("Int64")
2169
2167
  assert_frame_equal(actual_result_df, expected_result_df)
2170
2168
 
2171
2169
  for i in range(5):
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes