upgini 1.1.263a1__tar.gz → 1.1.264a1__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (85) hide show
  1. {upgini-1.1.263a1/src/upgini.egg-info → upgini-1.1.264a1}/PKG-INFO +1 -1
  2. {upgini-1.1.263a1 → upgini-1.1.264a1}/setup.py +1 -1
  3. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/all_operands.py +3 -0
  4. upgini-1.1.264a1/src/upgini/autofe/date.py +53 -0
  5. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/feature.py +1 -1
  6. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/operand.py +2 -0
  7. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/unary.py +15 -8
  8. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/vector.py +5 -3
  9. {upgini-1.1.263a1 → upgini-1.1.264a1/src/upgini.egg-info}/PKG-INFO +1 -1
  10. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini.egg-info/SOURCES.txt +2 -0
  11. upgini-1.1.264a1/tests/test_autofe_operands.py +27 -0
  12. {upgini-1.1.263a1 → upgini-1.1.264a1}/LICENSE +0 -0
  13. {upgini-1.1.263a1 → upgini-1.1.264a1}/README.md +0 -0
  14. {upgini-1.1.263a1 → upgini-1.1.264a1}/pyproject.toml +0 -0
  15. {upgini-1.1.263a1 → upgini-1.1.264a1}/setup.cfg +0 -0
  16. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/__init__.py +0 -0
  17. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/ads.py +0 -0
  18. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/ads_management/__init__.py +0 -0
  19. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/ads_management/ads_manager.py +0 -0
  20. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/__init__.py +0 -0
  21. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/data_source/__init__.py +0 -0
  24. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  25. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/dataset.py +0 -0
  26. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/errors.py +0 -0
  27. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/features_enricher.py +0 -0
  28. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/fingerprint.js +0 -0
  29. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/http.py +0 -0
  30. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/metadata.py +0 -0
  33. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/metrics.py +0 -0
  34. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
  36. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/search_task.py +0 -0
  45. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/spinner.py +0 -0
  46. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini/version_validator.py +0 -0
  68. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini.egg-info/dependency_links.txt +0 -0
  69. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini.egg-info/requires.txt +0 -0
  70. {upgini-1.1.263a1 → upgini-1.1.264a1}/src/upgini.egg-info/top_level.txt +0 -0
  71. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_binary_dataset.py +0 -0
  72. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_blocked_time_series.py +0 -0
  73. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_categorical_dataset.py +0 -0
  74. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_continuous_dataset.py +0 -0
  75. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_country_utils.py +0 -0
  76. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_custom_loss_utils.py +0 -0
  77. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_datetime_utils.py +0 -0
  78. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_email_utils.py +0 -0
  79. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_etalon_validation.py +0 -0
  80. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_features_enricher.py +0 -0
  81. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.263a1 → upgini-1.1.264a1}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.263a1
3
+ Version: 1.1.264a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.263a1"
43
+ version = "1.1.264a1"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -1,4 +1,5 @@
1
1
  from typing import Dict
2
+ from upgini.autofe.date import DateDiff, DateDiffType2
2
3
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
3
4
  from upgini.autofe.operand import Operand
4
5
  from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -35,6 +36,8 @@ ALL_OPERANDS: Dict[str, Operand] = {
35
36
  Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
36
37
  Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
37
38
  Sim(),
39
+ DateDiff(),
40
+ DateDiffType2(),
38
41
  ]
39
42
  }
40
43
 
@@ -0,0 +1,53 @@
1
+ from typing import Optional, Union
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ from upgini.autofe.operand import PandasOperand
6
+
7
+
8
+ class DateDiffMixin:
9
+ diff_unit: str = "D"
10
+ left_unit: Optional[str] = None
11
+ right_unit: Optional[str] = None
12
+
13
+ def _convert_to_date(
14
+ self, x: Union[pd.DataFrame, pd.Series], unit: Optional[str]
15
+ ) -> Union[pd.DataFrame, pd.Series]:
16
+ if isinstance(x, pd.DataFrame):
17
+ return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
18
+
19
+ return pd.to_datetime(x, unit=unit)
20
+
21
+
22
+ class DateDiff(PandasOperand, DateDiffMixin):
23
+ name = "date_diff"
24
+ is_binary = True
25
+ has_symmetry_importance = True
26
+
27
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
+ left = self._convert_to_date(left, self.left_unit)
29
+ right = self._convert_to_date(right, self.right_unit)
30
+ return self.__replace_negative((left - right) / np.timedelta64(1, self.diff_unit))
31
+
32
+ def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
33
+ x[x < 0] = None
34
+ return x
35
+
36
+
37
+ class DateDiffType2(PandasOperand, DateDiffMixin):
38
+ name = "date_diff_type2"
39
+ is_binary = True
40
+ has_symmetry_importance = True
41
+ is_vectorizable = False
42
+
43
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
44
+ left = self._convert_to_date(left, self.left_unit)
45
+ right = self._convert_to_date(right, self.right_unit)
46
+ future = right + (left.dt.year - right.dt.year).apply(
47
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
+ )
49
+ before = future[future < left]
50
+ future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
51
+ diff = (future - left) / np.timedelta64(1, self.diff_unit)
52
+
53
+ return diff
@@ -305,7 +305,7 @@ class FeatureGroup:
305
305
  grouped_features = []
306
306
 
307
307
  def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
308
- return (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
308
+ return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
309
309
 
310
310
  for op_child, features in itertools.groupby(candidates, groupby_func):
311
311
  op, main_child = op_child
@@ -73,6 +73,8 @@ class PandasOperand(Operand, abc.ABC):
73
73
 
74
74
 
75
75
  class VectorizableMixin(Operand):
76
+ group_index: int = 1
77
+
76
78
  def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
77
79
  if not kwargs.get(MAIN_COLUMN):
78
80
  raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
@@ -1,12 +1,13 @@
1
- from upgini.autofe.operand import PandasOperand
1
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
5
 
6
- class Abs(PandasOperand):
6
+ class Abs(PandasOperand, VectorizableMixin):
7
7
  name = "abs"
8
8
  is_unary = True
9
9
  is_vectorizable = True
10
+ group_index = 0
10
11
 
11
12
  def calculate_unary(self, data: pd.Series) -> pd.Series:
12
13
  return data.abs()
@@ -15,11 +16,12 @@ class Abs(PandasOperand):
15
16
  return data.abs()
16
17
 
17
18
 
18
- class Log(PandasOperand):
19
+ class Log(PandasOperand, VectorizableMixin):
19
20
  name = "log"
20
21
  is_unary = True
21
22
  is_vectorizable = True
22
23
  output_type = "float"
24
+ group_index = 0
23
25
 
24
26
  def calculate_unary(self, data: pd.Series) -> pd.Series:
25
27
  return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -28,11 +30,12 @@ class Log(PandasOperand):
28
30
  return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
29
31
 
30
32
 
31
- class Sqrt(PandasOperand):
33
+ class Sqrt(PandasOperand, VectorizableMixin):
32
34
  name = "sqrt"
33
35
  is_unary = True
34
36
  is_vectorizable = True
35
37
  output_type = "float"
38
+ group_index = 0
36
39
 
37
40
  def calculate_unary(self, data: pd.Series) -> pd.Series:
38
41
  return self._round_value(np.sqrt(np.abs(data)))
@@ -41,10 +44,11 @@ class Sqrt(PandasOperand):
41
44
  return self._round_value(np.sqrt(data.abs()))
42
45
 
43
46
 
44
- class Square(PandasOperand):
47
+ class Square(PandasOperand, VectorizableMixin):
45
48
  name = "square"
46
49
  is_unary = True
47
50
  is_vectorizable = True
51
+ group_index = 0
48
52
 
49
53
  def calculate_unary(self, data: pd.Series) -> pd.Series:
50
54
  return np.square(data)
@@ -53,11 +57,12 @@ class Square(PandasOperand):
53
57
  return np.square(data)
54
58
 
55
59
 
56
- class Sigmoid(PandasOperand):
60
+ class Sigmoid(PandasOperand, VectorizableMixin):
57
61
  name = "sigmoid"
58
62
  is_unary = True
59
63
  is_vectorizable = True
60
64
  output_type = "float"
65
+ group_index = 0
61
66
 
62
67
  def calculate_unary(self, data: pd.Series) -> pd.Series:
63
68
  return self._round_value(1 / (1 + np.exp(-data)))
@@ -66,12 +71,13 @@ class Sigmoid(PandasOperand):
66
71
  return self._round_value(1 / (1 + np.exp(-data)))
67
72
 
68
73
 
69
- class Floor(PandasOperand):
74
+ class Floor(PandasOperand, VectorizableMixin):
70
75
  name = "floor"
71
76
  is_unary = True
72
77
  is_vectorizable = True
73
78
  output_type = "int"
74
79
  input_type = "continuous"
80
+ group_index = 0
75
81
 
76
82
  def calculate_unary(self, data: pd.Series) -> pd.Series:
77
83
  return np.floor(data)
@@ -80,11 +86,12 @@ class Floor(PandasOperand):
80
86
  return np.floor(data)
81
87
 
82
88
 
83
- class Residual(PandasOperand):
89
+ class Residual(PandasOperand, VectorizableMixin):
84
90
  name = "residual"
85
91
  is_unary = True
86
92
  is_vectorizable = True
87
93
  input_type = "continuous"
94
+ group_index = 0
88
95
 
89
96
  def calculate_unary(self, data: pd.Series) -> pd.Series:
90
97
  return data - np.floor(data)
@@ -1,20 +1,22 @@
1
1
  from typing import List
2
2
  import pandas as pd
3
- from upgini.autofe.operand import PandasOperand
3
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
4
4
 
5
5
 
6
- class Mean(PandasOperand):
6
+ class Mean(PandasOperand, VectorizableMixin):
7
7
  name = "mean"
8
8
  output_type = "float"
9
9
  is_vector = True
10
+ group_index = 0
10
11
 
11
12
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
12
13
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
13
14
 
14
15
 
15
- class Sum(PandasOperand):
16
+ class Sum(PandasOperand, VectorizableMixin):
16
17
  name = "sum"
17
18
  is_vector = True
19
+ group_index = 0
18
20
 
19
21
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
20
22
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.263a1
3
+ Version: 1.1.264a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -24,6 +24,7 @@ src/upgini/ads_management/ads_manager.py
24
24
  src/upgini/autofe/__init__.py
25
25
  src/upgini/autofe/all_operands.py
26
26
  src/upgini/autofe/binary.py
27
+ src/upgini/autofe/date.py
27
28
  src/upgini/autofe/feature.py
28
29
  src/upgini/autofe/groupby.py
29
30
  src/upgini/autofe/operand.py
@@ -64,6 +65,7 @@ src/upgini/utils/sklearn_ext.py
64
65
  src/upgini/utils/target_utils.py
65
66
  src/upgini/utils/track_info.py
66
67
  src/upgini/utils/warning_counter.py
68
+ tests/test_autofe_operands.py
67
69
  tests/test_binary_dataset.py
68
70
  tests/test_blocked_time_series.py
69
71
  tests/test_categorical_dataset.py
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ from upgini.autofe.date import DateDiff, DateDiffType2
3
+
4
+ from datetime import datetime
5
+ from pandas.testing import assert_series_equal
6
+
7
+
8
+ def test_date_diff():
9
+ df = pd.DataFrame(
10
+ [[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(2023, 10, 10), datetime(2022, 10, 10)]],
11
+ columns=["date1", "date2"],
12
+ )
13
+
14
+ operand = DateDiff()
15
+ expected_result = pd.Series([10531, None])
16
+ assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
17
+
18
+
19
+ def test_date_diff_future():
20
+ df = pd.DataFrame(
21
+ [[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(1993, 4, 10), datetime(2022, 10, 10)]],
22
+ columns=["date1", "date2"],
23
+ )
24
+
25
+ operand = DateDiffType2()
26
+ expected_result = pd.Series([61.0, 182.0])
27
+ assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
File without changes
File without changes
File without changes
File without changes
File without changes