upgini 1.1.263a1__tar.gz → 1.1.264__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {upgini-1.1.263a1/src/upgini.egg-info → upgini-1.1.264}/PKG-INFO +1 -1
  2. {upgini-1.1.263a1 → upgini-1.1.264}/setup.py +1 -1
  3. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/all_operands.py +3 -0
  4. upgini-1.1.264/src/upgini/autofe/date.py +53 -0
  5. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/feature.py +1 -1
  6. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/operand.py +2 -0
  7. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/unary.py +15 -8
  8. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/vector.py +5 -3
  9. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/features_enricher.py +13 -22
  10. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/resource_bundle/strings.properties +2 -2
  11. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/datetime_utils.py +49 -1
  12. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/deduplicate_utils.py +18 -61
  13. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/target_utils.py +2 -6
  14. {upgini-1.1.263a1 → upgini-1.1.264/src/upgini.egg-info}/PKG-INFO +1 -1
  15. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini.egg-info/SOURCES.txt +2 -0
  16. upgini-1.1.264/tests/test_autofe_operands.py +27 -0
  17. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_datetime_utils.py +30 -2
  18. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_features_enricher.py +2 -0
  19. {upgini-1.1.263a1 → upgini-1.1.264}/LICENSE +0 -0
  20. {upgini-1.1.263a1 → upgini-1.1.264}/README.md +0 -0
  21. {upgini-1.1.263a1 → upgini-1.1.264}/pyproject.toml +0 -0
  22. {upgini-1.1.263a1 → upgini-1.1.264}/setup.cfg +0 -0
  23. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/__init__.py +0 -0
  24. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/ads.py +0 -0
  25. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/ads_management/__init__.py +0 -0
  26. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/ads_management/ads_manager.py +0 -0
  27. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/__init__.py +0 -0
  28. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/binary.py +0 -0
  29. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/autofe/groupby.py +0 -0
  30. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/data_source/__init__.py +0 -0
  31. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/data_source/data_source_publisher.py +0 -0
  32. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/dataset.py +0 -0
  33. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/errors.py +0 -0
  34. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/fingerprint.js +0 -0
  35. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/http.py +0 -0
  36. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/mdc/__init__.py +0 -0
  37. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/mdc/context.py +0 -0
  38. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/metadata.py +0 -0
  39. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/metrics.py +0 -0
  40. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/normalizer/__init__.py +0 -0
  41. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/normalizer/phone_normalizer.py +0 -0
  42. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/resource_bundle/__init__.py +0 -0
  43. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/resource_bundle/exceptions.py +0 -0
  44. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  45. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/sampler/__init__.py +0 -0
  46. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/sampler/base.py +0 -0
  47. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/sampler/random_under_sampler.py +0 -0
  48. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/sampler/utils.py +0 -0
  49. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/search_task.py +0 -0
  50. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/spinner.py +0 -0
  51. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/__init__.py +0 -0
  52. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/base_search_key_detector.py +0 -0
  53. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/blocked_time_series.py +0 -0
  54. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/country_utils.py +0 -0
  55. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/custom_loss_utils.py +0 -0
  56. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/cv_utils.py +0 -0
  57. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/display_utils.py +0 -0
  58. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/email_utils.py +0 -0
  59. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/fallback_progress_bar.py +0 -0
  60. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/features_validator.py +0 -0
  61. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/format.py +0 -0
  62. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/ip_utils.py +0 -0
  63. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/phone_utils.py +0 -0
  64. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/postal_code_utils.py +0 -0
  65. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/progress_bar.py +0 -0
  66. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/sklearn_ext.py +0 -0
  67. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/utils/warning_counter.py +0 -0
  69. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini/version_validator.py +0 -0
  70. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini.egg-info/dependency_links.txt +0 -0
  71. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini.egg-info/requires.txt +0 -0
  72. {upgini-1.1.263a1 → upgini-1.1.264}/src/upgini.egg-info/top_level.txt +0 -0
  73. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_binary_dataset.py +0 -0
  74. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_blocked_time_series.py +0 -0
  75. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_categorical_dataset.py +0 -0
  76. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_continuous_dataset.py +0 -0
  77. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_country_utils.py +0 -0
  78. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_custom_loss_utils.py +0 -0
  79. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_email_utils.py +0 -0
  80. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_etalon_validation.py +0 -0
  81. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.263a1 → upgini-1.1.264}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.263a1
3
+ Version: 1.1.264
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.263a1"
43
+ version = "1.1.264"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -1,4 +1,5 @@
1
1
  from typing import Dict
2
+ from upgini.autofe.date import DateDiff, DateDiffType2
2
3
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
3
4
  from upgini.autofe.operand import Operand
4
5
  from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -35,6 +36,8 @@ ALL_OPERANDS: Dict[str, Operand] = {
35
36
  Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
36
37
  Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
37
38
  Sim(),
39
+ DateDiff(),
40
+ DateDiffType2(),
38
41
  ]
39
42
  }
40
43
 
@@ -0,0 +1,53 @@
1
+ from typing import Optional, Union
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ from upgini.autofe.operand import PandasOperand
6
+
7
+
8
+ class DateDiffMixin:
9
+ diff_unit: str = "D"
10
+ left_unit: Optional[str] = None
11
+ right_unit: Optional[str] = None
12
+
13
+ def _convert_to_date(
14
+ self, x: Union[pd.DataFrame, pd.Series], unit: Optional[str]
15
+ ) -> Union[pd.DataFrame, pd.Series]:
16
+ if isinstance(x, pd.DataFrame):
17
+ return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
18
+
19
+ return pd.to_datetime(x, unit=unit)
20
+
21
+
22
+ class DateDiff(PandasOperand, DateDiffMixin):
23
+ name = "date_diff"
24
+ is_binary = True
25
+ has_symmetry_importance = True
26
+
27
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
+ left = self._convert_to_date(left, self.left_unit)
29
+ right = self._convert_to_date(right, self.right_unit)
30
+ return self.__replace_negative((left - right) / np.timedelta64(1, self.diff_unit))
31
+
32
+ def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
33
+ x[x < 0] = None
34
+ return x
35
+
36
+
37
+ class DateDiffType2(PandasOperand, DateDiffMixin):
38
+ name = "date_diff_type2"
39
+ is_binary = True
40
+ has_symmetry_importance = True
41
+ is_vectorizable = False
42
+
43
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
44
+ left = self._convert_to_date(left, self.left_unit)
45
+ right = self._convert_to_date(right, self.right_unit)
46
+ future = right + (left.dt.year - right.dt.year).apply(
47
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
+ )
49
+ before = future[future < left]
50
+ future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
51
+ diff = (future - left) / np.timedelta64(1, self.diff_unit)
52
+
53
+ return diff
@@ -305,7 +305,7 @@ class FeatureGroup:
305
305
  grouped_features = []
306
306
 
307
307
  def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
308
- return (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
308
+ return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
309
309
 
310
310
  for op_child, features in itertools.groupby(candidates, groupby_func):
311
311
  op, main_child = op_child
@@ -73,6 +73,8 @@ class PandasOperand(Operand, abc.ABC):
73
73
 
74
74
 
75
75
  class VectorizableMixin(Operand):
76
+ group_index: int = 1
77
+
76
78
  def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
77
79
  if not kwargs.get(MAIN_COLUMN):
78
80
  raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
@@ -1,12 +1,13 @@
1
- from upgini.autofe.operand import PandasOperand
1
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
5
 
6
- class Abs(PandasOperand):
6
+ class Abs(PandasOperand, VectorizableMixin):
7
7
  name = "abs"
8
8
  is_unary = True
9
9
  is_vectorizable = True
10
+ group_index = 0
10
11
 
11
12
  def calculate_unary(self, data: pd.Series) -> pd.Series:
12
13
  return data.abs()
@@ -15,11 +16,12 @@ class Abs(PandasOperand):
15
16
  return data.abs()
16
17
 
17
18
 
18
- class Log(PandasOperand):
19
+ class Log(PandasOperand, VectorizableMixin):
19
20
  name = "log"
20
21
  is_unary = True
21
22
  is_vectorizable = True
22
23
  output_type = "float"
24
+ group_index = 0
23
25
 
24
26
  def calculate_unary(self, data: pd.Series) -> pd.Series:
25
27
  return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -28,11 +30,12 @@ class Log(PandasOperand):
28
30
  return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
29
31
 
30
32
 
31
- class Sqrt(PandasOperand):
33
+ class Sqrt(PandasOperand, VectorizableMixin):
32
34
  name = "sqrt"
33
35
  is_unary = True
34
36
  is_vectorizable = True
35
37
  output_type = "float"
38
+ group_index = 0
36
39
 
37
40
  def calculate_unary(self, data: pd.Series) -> pd.Series:
38
41
  return self._round_value(np.sqrt(np.abs(data)))
@@ -41,10 +44,11 @@ class Sqrt(PandasOperand):
41
44
  return self._round_value(np.sqrt(data.abs()))
42
45
 
43
46
 
44
- class Square(PandasOperand):
47
+ class Square(PandasOperand, VectorizableMixin):
45
48
  name = "square"
46
49
  is_unary = True
47
50
  is_vectorizable = True
51
+ group_index = 0
48
52
 
49
53
  def calculate_unary(self, data: pd.Series) -> pd.Series:
50
54
  return np.square(data)
@@ -53,11 +57,12 @@ class Square(PandasOperand):
53
57
  return np.square(data)
54
58
 
55
59
 
56
- class Sigmoid(PandasOperand):
60
+ class Sigmoid(PandasOperand, VectorizableMixin):
57
61
  name = "sigmoid"
58
62
  is_unary = True
59
63
  is_vectorizable = True
60
64
  output_type = "float"
65
+ group_index = 0
61
66
 
62
67
  def calculate_unary(self, data: pd.Series) -> pd.Series:
63
68
  return self._round_value(1 / (1 + np.exp(-data)))
@@ -66,12 +71,13 @@ class Sigmoid(PandasOperand):
66
71
  return self._round_value(1 / (1 + np.exp(-data)))
67
72
 
68
73
 
69
- class Floor(PandasOperand):
74
+ class Floor(PandasOperand, VectorizableMixin):
70
75
  name = "floor"
71
76
  is_unary = True
72
77
  is_vectorizable = True
73
78
  output_type = "int"
74
79
  input_type = "continuous"
80
+ group_index = 0
75
81
 
76
82
  def calculate_unary(self, data: pd.Series) -> pd.Series:
77
83
  return np.floor(data)
@@ -80,11 +86,12 @@ class Floor(PandasOperand):
80
86
  return np.floor(data)
81
87
 
82
88
 
83
- class Residual(PandasOperand):
89
+ class Residual(PandasOperand, VectorizableMixin):
84
90
  name = "residual"
85
91
  is_unary = True
86
92
  is_vectorizable = True
87
93
  input_type = "continuous"
94
+ group_index = 0
88
95
 
89
96
  def calculate_unary(self, data: pd.Series) -> pd.Series:
90
97
  return data - np.floor(data)
@@ -1,20 +1,22 @@
1
1
  from typing import List
2
2
  import pandas as pd
3
- from upgini.autofe.operand import PandasOperand
3
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
4
4
 
5
5
 
6
- class Mean(PandasOperand):
6
+ class Mean(PandasOperand, VectorizableMixin):
7
7
  name = "mean"
8
8
  output_type = "float"
9
9
  is_vector = True
10
+ group_index = 0
10
11
 
11
12
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
12
13
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
13
14
 
14
15
 
15
- class Sum(PandasOperand):
16
+ class Sum(PandasOperand, VectorizableMixin):
16
17
  name = "sum"
17
18
  is_vector = True
19
+ group_index = 0
18
20
 
19
21
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
20
22
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
@@ -70,6 +70,7 @@ from upgini.utils.datetime_utils import (
70
70
  DateTimeSearchKeyConverter,
71
71
  is_blocked_time_series,
72
72
  is_time_series,
73
+ validate_dates_distribution,
73
74
  )
74
75
  from upgini.utils.deduplicate_utils import (
75
76
  clean_full_duplicates,
@@ -1685,9 +1686,6 @@ class FeaturesEnricher(TransformerMixin):
1685
1686
  df = validated_X.copy()
1686
1687
 
1687
1688
  df[TARGET] = validated_y
1688
-
1689
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1690
-
1691
1689
  num_samples = _num_samples(df)
1692
1690
  if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1693
1691
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
@@ -1922,7 +1920,6 @@ class FeaturesEnricher(TransformerMixin):
1922
1920
 
1923
1921
  meaning_types = {col: key.value for col, key in search_keys.items()}
1924
1922
  non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1925
- # Don't pass
1926
1923
  if email_converted_to_hem:
1927
1924
  non_keys_columns.append(email_column)
1928
1925
 
@@ -1944,7 +1941,6 @@ class FeaturesEnricher(TransformerMixin):
1944
1941
  if add_fit_system_record_id:
1945
1942
  df = self.__add_fit_system_record_id(df, dict(), search_keys)
1946
1943
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1947
- non_keys_columns.append(SORT_ID)
1948
1944
 
1949
1945
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1950
1946
 
@@ -2221,6 +2217,10 @@ class FeaturesEnricher(TransformerMixin):
2221
2217
  self.fit_search_keys = self.search_keys.copy()
2222
2218
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2223
2219
 
2220
+ validate_dates_distribution(
2221
+ validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2222
+ )
2223
+
2224
2224
  has_date = self._get_date_column(self.fit_search_keys) is not None
2225
2225
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2226
2226
  self._validate_binary_observations(validated_y, model_task_type)
@@ -2883,35 +2883,26 @@ class FeaturesEnricher(TransformerMixin):
2883
2883
 
2884
2884
  # order by date and idempotent order by other keys
2885
2885
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
2886
- sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
2887
2886
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2888
2887
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2889
- sort_exclude_columns.append(self._get_date_column(search_keys))
2890
2888
  else:
2891
2889
  date_column = self._get_date_column(search_keys)
2892
2890
  sort_columns = [date_column] if date_column is not None else []
2893
2891
 
2894
- other_columns = sorted(
2892
+ other_search_keys = sorted(
2895
2893
  [
2896
- c
2897
- for c in df.columns
2898
- if c not in sort_columns
2899
- and c not in sort_exclude_columns
2900
- and df[c].nunique() > 1
2894
+ sk
2895
+ for sk, key_type in search_keys.items()
2896
+ if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2897
+ and sk in df.columns
2898
+ and df[sk].nunique() > 1 # don't use constant keys for hash
2901
2899
  ]
2902
- # [
2903
- # sk
2904
- # for sk, key_type in search_keys.items()
2905
- # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2906
- # and sk in df.columns
2907
- # and df[sk].nunique() > 1 # don't use constant keys for hash
2908
- # ]
2909
2900
  )
2910
2901
 
2911
2902
  search_keys_hash = "search_keys_hash"
2912
- if len(other_columns) > 0:
2903
+ if len(other_search_keys) > 0:
2913
2904
  sort_columns.append(search_keys_hash)
2914
- df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
2905
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[sorted(other_search_keys)], index=False)
2915
2906
 
2916
2907
  df = df.sort_values(by=sort_columns)
2917
2908
 
@@ -111,6 +111,7 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
114
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
114
115
  # eval set validation
115
116
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
116
117
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -145,8 +146,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
145
146
  dataset_empty_column_names=Some column names are empty. Add names please
146
147
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
147
148
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
148
- dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
149
- dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
149
+ dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
150
150
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
151
151
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
152
152
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- from typing import List, Optional
4
+ from typing import Dict, List, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
9
9
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
+ from upgini.metadata import SearchKey
12
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
+ from upgini.utils.warning_counter import WarningCounter
13
15
 
14
16
  DATE_FORMATS = [
15
17
  "%Y-%m-%d",
@@ -225,3 +227,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
225
227
 
226
228
  is_diff_less_than_two_columns = grouped.apply(check_differences)
227
229
  return is_diff_less_than_two_columns.all()
230
+
231
+
232
+ def validate_dates_distribution(
233
+ X: pd.DataFrame,
234
+ search_keys: Dict[str, SearchKey],
235
+ logger: Optional[logging.Logger] = None,
236
+ bundle: Optional[ResourceBundle] = None,
237
+ warning_counter: Optional[WarningCounter] = None,
238
+ ):
239
+ maybe_date_col = None
240
+ for key, key_type in search_keys.items():
241
+ if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
242
+ maybe_date_col = key
243
+
244
+ if maybe_date_col is None:
245
+ for col in X.columns:
246
+ if col in search_keys:
247
+ continue
248
+ try:
249
+ pd.to_datetime(X[col])
250
+ maybe_date_col = col
251
+ break
252
+ except Exception:
253
+ pass
254
+
255
+ if maybe_date_col is None:
256
+ return
257
+
258
+ dates = pd.to_datetime(X[maybe_date_col]).dt.date
259
+
260
+ date_counts = dates.value_counts().sort_index()
261
+
262
+ date_counts_1 = date_counts[: round(len(date_counts) / 2)]
263
+ date_counts_2 = date_counts[round(len(date_counts) / 2) :]
264
+ ratio = date_counts_2.mean() / date_counts_1.mean()
265
+
266
+ if ratio > 1.2 or ratio < 0.8:
267
+ if warning_counter is not None:
268
+ warning_counter.increment()
269
+ if logger is None:
270
+ logger = logging.getLogger("muted_logger")
271
+ logger.setLevel("FATAL")
272
+ bundle = bundle or get_custom_bundle()
273
+ msg = bundle.get("x_unstable_by_date")
274
+ print(msg)
275
+ logger.warning(msg)
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
7
7
  from upgini.resource_bundle import ResourceBundle
8
8
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
9
  from upgini.utils.target_utils import define_task
@@ -78,58 +78,20 @@ def remove_fintech_duplicates(
78
78
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
79
79
  if len(rows_with_diff_target) > 0:
80
80
  unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
81
- if EVAL_SET_INDEX not in df.columns:
82
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
83
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
84
- perc = len(rows_to_remove) * 100 / len(df)
85
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
86
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
87
- )
88
- if not silent:
89
- print(msg)
90
- if logger:
91
- logger.warning(msg)
92
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
93
- df = df[~df.index.isin(rows_to_remove.index)]
94
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
- else:
96
- # Indices in train and eval_set can be the same so we remove rows from them separately
97
- train = df.query(f"{EVAL_SET_INDEX} == 0")
98
- train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
99
- train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
100
- train_perc = len(train_rows_to_remove) * 100 / len(train)
101
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
102
- train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
103
- )
104
- if not silent:
105
- print(msg)
106
- if logger:
107
- logger.warning(msg)
108
- logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
109
- train = train[~train.index.isin(train_rows_to_remove.index)]
110
- logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
111
-
112
- evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
113
- new_evals = []
114
- for i, eval in enumerate(evals):
115
- eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
116
- eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
117
- eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
118
- msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
119
- eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
120
- )
121
- if not silent:
122
- print(msg)
123
- if logger:
124
- logger.warning(msg)
125
- logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
126
- eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
127
- logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
128
- new_evals.append(eval)
129
-
130
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
131
- df = pd.concat([train] + new_evals)
132
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
81
+ rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
82
+ rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
83
+ perc = len(rows_to_remove) * 100 / len(df)
84
+ msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
85
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
86
+ )
87
+ if not silent:
88
+ print(msg)
89
+ if logger:
90
+ logger.warning(msg)
91
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
92
+ df = df[~df.index.isin(rows_to_remove.index)]
93
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
94
+
133
95
  return df
134
96
 
135
97
 
@@ -139,18 +101,14 @@ def clean_full_duplicates(
139
101
  nrows = len(df)
140
102
  if nrows == 0:
141
103
  return df
142
- # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
104
+ # Remove absolute duplicates (exclude system_record_id)
143
105
  unique_columns = df.columns.tolist()
144
106
  if SYSTEM_RECORD_ID in unique_columns:
145
107
  unique_columns.remove(SYSTEM_RECORD_ID)
146
108
  if SORT_ID in unique_columns:
147
109
  unique_columns.remove(SORT_ID)
148
- if EVAL_SET_INDEX in unique_columns:
149
- unique_columns.remove(EVAL_SET_INDEX)
150
110
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
151
- # Train segment goes first so if duplicates are found in train and eval set
152
- # then we keep unique rows in train segment
153
- df = df.drop_duplicates(subset=unique_columns, keep="first")
111
+ df = df.drop_duplicates(subset=unique_columns)
154
112
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
155
113
  nrows_after_full_dedup = len(df)
156
114
  share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
@@ -165,7 +123,7 @@ def clean_full_duplicates(
165
123
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
166
124
  if marked_duplicates.sum() > 0:
167
125
  dups_indices = df[marked_duplicates].index.to_list()
168
- nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
126
+ nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
169
127
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
170
128
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
171
129
 
@@ -175,7 +133,6 @@ def clean_full_duplicates(
175
133
  print(msg)
176
134
  df = df.drop_duplicates(subset=unique_columns, keep=False)
177
135
  logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
178
-
179
136
  return df
180
137
 
181
138
 
@@ -132,9 +132,7 @@ def balance_undersample(
132
132
  class_value = classes[class_idx]
133
133
  class_count = vc[class_value]
134
134
  sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
135
- sampler = RandomUnderSampler(
136
- sampling_strategy=sample_strategy, random_state=random_state
137
- )
135
+ sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
138
136
  X = df[SYSTEM_RECORD_ID]
139
137
  X = X.to_frame(SYSTEM_RECORD_ID)
140
138
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
@@ -153,9 +151,7 @@ def balance_undersample(
153
151
  minority_class = df[df[target_column] == min_class_value]
154
152
  majority_class = df[df[target_column] != min_class_value]
155
153
  sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
156
- sampled_majority_class = majority_class.sample(
157
- n=sample_size, random_state=random_state
158
- )
154
+ sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
159
155
  resampled_data = df[
160
156
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
161
157
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.263a1
3
+ Version: 1.1.264
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -24,6 +24,7 @@ src/upgini/ads_management/ads_manager.py
24
24
  src/upgini/autofe/__init__.py
25
25
  src/upgini/autofe/all_operands.py
26
26
  src/upgini/autofe/binary.py
27
+ src/upgini/autofe/date.py
27
28
  src/upgini/autofe/feature.py
28
29
  src/upgini/autofe/groupby.py
29
30
  src/upgini/autofe/operand.py
@@ -64,6 +65,7 @@ src/upgini/utils/sklearn_ext.py
64
65
  src/upgini/utils/target_utils.py
65
66
  src/upgini/utils/track_info.py
66
67
  src/upgini/utils/warning_counter.py
68
+ tests/test_autofe_operands.py
67
69
  tests/test_binary_dataset.py
68
70
  tests/test_blocked_time_series.py
69
71
  tests/test_categorical_dataset.py
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ from upgini.autofe.date import DateDiff, DateDiffType2
3
+
4
+ from datetime import datetime
5
+ from pandas.testing import assert_series_equal
6
+
7
+
8
+ def test_date_diff():
9
+ df = pd.DataFrame(
10
+ [[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(2023, 10, 10), datetime(2022, 10, 10)]],
11
+ columns=["date1", "date2"],
12
+ )
13
+
14
+ operand = DateDiff()
15
+ expected_result = pd.Series([10531, None])
16
+ assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
17
+
18
+
19
+ def test_date_diff_future():
20
+ df = pd.DataFrame(
21
+ [[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(1993, 4, 10), datetime(2022, 10, 10)]],
22
+ columns=["date1", "date2"],
23
+ )
24
+
25
+ operand = DateDiffType2()
26
+ expected_result = pd.Series([61.0, 182.0])
27
+ assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
@@ -1,7 +1,13 @@
1
- import pandas as pd
2
1
  import numpy as np
2
+ import pandas as pd
3
3
 
4
- from upgini.utils.datetime_utils import is_blocked_time_series, is_time_series
4
+ from upgini.metadata import SearchKey
5
+ from upgini.utils.datetime_utils import (
6
+ is_blocked_time_series,
7
+ is_time_series,
8
+ validate_dates_distribution,
9
+ )
10
+ from upgini.utils.warning_counter import WarningCounter
5
11
 
6
12
  pd.set_option("mode.chained_assignment", "raise")
7
13
 
@@ -183,3 +189,25 @@ def test_multivariate_time_series():
183
189
  assert not is_blocked_time_series(df, "date", ["date"])
184
190
 
185
191
  assert is_blocked_time_series(df, "date", ["date", "feature3"])
192
+
193
+
194
+ def test_validate_dates_distribution():
195
+ df = pd.DataFrame({"date": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
196
+ warning_counter = WarningCounter()
197
+ validate_dates_distribution(df, {}, warning_counter=warning_counter)
198
+ assert warning_counter.has_warnings()
199
+
200
+ df = pd.DataFrame({"date": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
201
+ warning_counter = WarningCounter()
202
+ validate_dates_distribution(df, {}, warning_counter=warning_counter)
203
+ assert not warning_counter.has_warnings()
204
+
205
+ df = pd.DataFrame(
206
+ {
207
+ "date2": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
208
+ "date1": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
209
+ }
210
+ )
211
+ warning_counter = WarningCounter()
212
+ validate_dates_distribution(df, {"date1": SearchKey.DATE}, warning_counter=warning_counter)
213
+ assert warning_counter.has_warnings()
@@ -2164,6 +2164,8 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
2164
2164
 
2165
2165
  actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
2166
2166
  # actual_result_df.to_parquet(expected_result_path)
2167
+ actual_result_df["phone_num_a54a33"] = actual_result_df["phone_num_a54a33"].astype("Int64")
2168
+ actual_result_df["rep_date_f5d6bb"] = actual_result_df["rep_date_f5d6bb"].astype("Int64")
2167
2169
  assert_frame_equal(actual_result_df, expected_result_df)
2168
2170
 
2169
2171
  for i in range(5):
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes