upgini 1.1.229a3__tar.gz → 1.1.230__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (79)
  1. {upgini-1.1.229a3/src/upgini.egg-info → upgini-1.1.230}/PKG-INFO +1 -1
  2. {upgini-1.1.229a3 → upgini-1.1.230}/setup.py +1 -1
  3. upgini-1.1.230/src/upgini/autofe/all_operands.py +43 -0
  4. upgini-1.1.230/src/upgini/autofe/binary.py +133 -0
  5. upgini-1.1.230/src/upgini/autofe/feature.py +298 -0
  6. upgini-1.1.230/src/upgini/autofe/groupby.py +82 -0
  7. upgini-1.1.230/src/upgini/autofe/operand.py +70 -0
  8. upgini-1.1.230/src/upgini/autofe/unary.py +105 -0
  9. upgini-1.1.230/src/upgini/autofe/vector.py +20 -0
  10. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/features_enricher.py +1 -2
  11. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/metadata.py +1 -1
  12. upgini-1.1.230/src/upgini/sampler/__init__.py +0 -0
  13. {upgini-1.1.229a3 → upgini-1.1.230/src/upgini.egg-info}/PKG-INFO +1 -1
  14. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini.egg-info/SOURCES.txt +8 -0
  15. {upgini-1.1.229a3 → upgini-1.1.230}/LICENSE +0 -0
  16. {upgini-1.1.229a3 → upgini-1.1.230}/README.md +0 -0
  17. {upgini-1.1.229a3 → upgini-1.1.230}/pyproject.toml +0 -0
  18. {upgini-1.1.229a3 → upgini-1.1.230}/setup.cfg +0 -0
  19. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/__init__.py +0 -0
  20. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/ads.py +0 -0
  21. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/ads_management/__init__.py +0 -0
  22. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/ads_management/ads_manager.py +0 -0
  23. {upgini-1.1.229a3/src/upgini/data_source → upgini-1.1.230/src/upgini/autofe}/__init__.py +0 -0
  24. {upgini-1.1.229a3/src/upgini/normalizer → upgini-1.1.230/src/upgini/data_source}/__init__.py +0 -0
  25. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/dataset.py +0 -0
  27. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/errors.py +0 -0
  28. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/http.py +0 -0
  29. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/mdc/__init__.py +0 -0
  30. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/mdc/context.py +0 -0
  31. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.229a3/src/upgini/sampler → upgini-1.1.230/src/upgini/normalizer}/__init__.py +0 -0
  33. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/sampler/base.py +0 -0
  38. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/sampler/random_under_sampler.py +0 -0
  39. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/sampler/utils.py +0 -0
  40. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/search_task.py +0 -0
  41. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/spinner.py +0 -0
  42. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/__init__.py +0 -0
  43. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/base_search_key_detector.py +0 -0
  44. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/blocked_time_series.py +0 -0
  45. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/country_utils.py +0 -0
  46. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/custom_loss_utils.py +0 -0
  47. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/cv_utils.py +0 -0
  48. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/datetime_utils.py +0 -0
  49. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/display_utils.py +0 -0
  50. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/email_utils.py +0 -0
  51. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/fallback_progress_bar.py +0 -0
  52. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/features_validator.py +0 -0
  53. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/format.py +0 -0
  54. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/ip_utils.py +0 -0
  55. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/phone_utils.py +0 -0
  56. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/postal_code_utils.py +0 -0
  57. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/progress_bar.py +0 -0
  58. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/sklearn_ext.py +0 -0
  59. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/target_utils.py +0 -0
  60. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/track_info.py +0 -0
  61. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/utils/warning_counter.py +0 -0
  62. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/version_validator.py +0 -0
  63. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini.egg-info/dependency_links.txt +0 -0
  64. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini.egg-info/requires.txt +0 -0
  65. {upgini-1.1.229a3 → upgini-1.1.230}/src/upgini.egg-info/top_level.txt +0 -0
  66. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_binary_dataset.py +0 -0
  67. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_blocked_time_series.py +0 -0
  68. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_categorical_dataset.py +0 -0
  69. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_continuous_dataset.py +0 -0
  70. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_country_utils.py +0 -0
  71. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_custom_loss_utils.py +0 -0
  72. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_datetime_utils.py +0 -0
  73. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_email_utils.py +0 -0
  74. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_etalon_validation.py +0 -0
  75. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_features_enricher.py +0 -0
  76. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_metrics.py +0 -0
  77. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_phone_utils.py +0 -0
  78. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_postal_code_utils.py +0 -0
  79. {upgini-1.1.229a3 → upgini-1.1.230}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.229a3
3
+ Version: 1.1.230
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.229a3"
43
+ version = "1.1.230"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -0,0 +1,43 @@
1
+ from typing import Dict
2
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
3
+ from upgini.autofe.operand import Operand
4
+ from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
5
+ from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
6
+ from upgini.autofe.vector import Mean, Sum
7
+
8
# Registry of every operand known to the formula parser, keyed by operand name.
# Entries built from the bare ``Operand`` class carry metadata only: they are
# not ``PandasOperand`` instances, so they expose no calculate_* methods.
ALL_OPERANDS: Dict[str, Operand] = dict(
    (operand.name, operand)
    for operand in (
        # unary
        Freq(),
        # vector
        Mean(),
        Sum(),
        # unary
        Abs(),
        Log(),
        Sqrt(),
        Square(),
        Sigmoid(),
        Floor(),
        Residual(),
        # binary
        Min(),
        Max(),
        Add(),
        Subtract(),
        Multiply(),
        Divide(),
        # group-by aggregations
        GroupByThenAgg(name="GroupByThenMin", agg="min"),
        GroupByThenAgg(name="GroupByThenMax", agg="max"),
        GroupByThenAgg(name="GroupByThenMean", agg="mean"),
        GroupByThenAgg(name="GroupByThenMedian", agg="median"),
        GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
        GroupByThenRank(),
        # metadata-only operands
        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
        Sim(),
    )
)
40
+
41
+
42
def find_op(name):
    """Look up a registered operand by its name.

    Returns the operand stored in ``ALL_OPERANDS`` under ``name``, or ``None``
    when no operand with that name is registered.
    """
    try:
        return ALL_OPERANDS[name]
    except KeyError:
        return None
@@ -0,0 +1,133 @@
1
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
+ import numpy as np
3
+ import pandas as pd
4
+ from numpy import dot
5
+ from numpy.linalg import norm
6
+
7
+
8
class Min(PandasOperand):
    """Binary operand: element-wise minimum of two inputs."""

    name = "min"
    is_binary = True
    has_symmetry_importance = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``np.minimum(left, right)``."""
        smaller = np.minimum(left, right)
        return smaller
15
+
16
+
17
class Max(PandasOperand):
    """Binary operand: element-wise maximum of two inputs."""

    name = "max"
    is_binary = True
    has_symmetry_importance = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``np.maximum(left, right)``."""
        larger = np.maximum(left, right)
        return larger
24
+
25
+
26
class Add(PandasOperand, VectorizableMixin):
    """Binary ``+`` operand with a vectorized group form."""

    name = "+"
    alias = "add"
    is_binary = True
    has_symmetry_importance = True
    is_vectorizable = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``left + right``."""
        total = left + right
        return total

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Add the main column to every other column of ``data``.

        The main column name is taken from ``kwargs`` by
        ``validate_calculation``; the addition is aligned on the row index.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        return values.add(data[group_column], axis=0)
42
+
43
+
44
class Subtract(PandasOperand, VectorizableMixin):
    """Binary ``-`` operand with a vectorized group form."""

    name = "-"
    alias = "sub"
    is_binary = True
    has_symmetry_importance = True
    is_vectorizable = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``left - right``."""
        difference = left - right
        return difference

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Subtract the main column from every other column of ``data``.

        The main column name is taken from ``kwargs`` by
        ``validate_calculation``; the subtraction is aligned on the row index.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        return values.sub(data[group_column], axis=0)
60
+
61
+
62
class Multiply(PandasOperand, VectorizableMixin):
    """Binary ``*`` operand with a vectorized group form."""

    name = "*"
    alias = "mul"
    is_binary = True
    has_symmetry_importance = True
    is_vectorizable = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``left * right``."""
        product = left * right
        return product

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Multiply every non-main column of ``data`` by the main column.

        The main column name is taken from ``kwargs`` by
        ``validate_calculation``; the product is aligned on the row index.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        return values.mul(data[group_column], axis=0)
78
+
79
+
80
class Divide(PandasOperand, VectorizableMixin):
    """Binary ``/`` operand; a zero divisor is replaced by NaN before dividing."""

    name = "/"
    alias = "div"
    is_binary = True
    has_symmetry_importance = True
    is_vectorizable = True
    output_type = "float"

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``left / right`` with zeros in ``right`` turned into NaN."""
        safe_divisor = right.replace(0, np.nan)
        return left / safe_divisor

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Divide every non-main column of ``data`` by the main column.

        Zeros in the main column are replaced by NaN first, so those rows
        produce NaN instead of an infinite quotient.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        safe_divisor = data[group_column].replace(0, np.nan)
        return values.div(safe_divisor, axis=0)
97
+
98
+
99
class Combine(PandasOperand):
    """Binary operand joining two columns into one ``"<left>_<right>"`` string."""

    name = "Combine"
    is_binary = True
    has_symmetry_importance = True
    output_type = "object"

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Concatenate the string forms of both inputs with ``"_"``.

        Rows where either input is missing are set to NaN rather than
        keeping a string built from the text "nan".
        """
        missing = left.isna() | right.isna()
        joined = left.astype(str) + "_" + right.astype(str)
        joined[missing] = np.nan
        return pd.Series(joined, index=left.index)
109
+
110
+
111
class CombineThenFreq(PandasOperand):
    """Binary operand: relative frequency of each ``"<left>_<right>"`` combination."""

    name = "CombineThenFreq"
    is_binary = True
    has_symmetry_importance = True
    output_type = "float"
    is_distribution_dependent = True
    input_type = "discrete"

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return, per row, the normalized frequency of its combined value.

        Both inputs are cast to strings and joined with ``"_"``; rows where
        either input is missing are set to NaN. Frequencies come from
        ``value_counts(normalize=True)`` and are mapped back onto the rows
        through ``self._loc``.
        """
        temp = left.astype(str) + "_" + right.astype(str)
        temp[left.isna() | right.isna()] = np.nan
        value_counts = temp.value_counts(normalize=True)
        # The original computed this lookup but dropped the result, so the
        # method always returned None.
        return self._loc(temp, value_counts)
124
+
125
+
126
class Sim(PandasOperand):
    """Binary operand computing ``dot(left, right) / (norm(left) * norm(right))``."""

    name = "sim"
    is_binary = True
    output_type = "float"
    has_symmetry_importance = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return the dot product of the inputs divided by the product of their norms.

        NOTE(review): for two plain numeric Series this expression yields a
        single scalar, not a Series as annotated - confirm the expected
        input layout with the caller.
        """
        numerator = dot(left, right)
        denominator = norm(left) * norm(right)
        return numerator / denominator
@@ -0,0 +1,298 @@
1
+ import hashlib
2
+ from typing import Dict
3
+ import numpy as np
4
+ import pandas as pd
5
+ import itertools
6
+ from upgini.autofe.operand import PandasOperand
7
+ from upgini.autofe.all_operands import (
8
+ find_op,
9
+ )
10
+
11
+
12
class FeatureGroup(object):
    """A batch of features sharing one operand, evaluated in a single call.

    ``children`` are the grouped features; ``main_column_node`` is the node
    whose first column is passed to the operand as ``main_column`` (``None``
    when the group has no main column).
    """

    def __init__(self, op, main_column, children):
        self.op = op
        self.main_column_node = main_column
        self.children = children
        self.data = None

    def get_columns(self, **kwargs):
        """Return the children's columns in first-seen order.

        A column already reported by an earlier child is not repeated.
        """
        ordered = []
        known = set()
        for child in self.children:
            child_columns = child.get_columns(**kwargs)
            ordered += [column for column in child_columns if column not in known]
            known.update(child_columns)
        return ordered

    def get_display_names(self, **kwargs):
        """Return the display name of every child, in child order."""
        return [child.get_display_name(**kwargs) for child in self.children]

    def calculate(self, data: pd.DataFrame, is_root=False):
        """Evaluate the whole group with the operand's ``calculate_group``.

        The result columns are renamed to the children's display names and
        infinite values are replaced by NaN. When ``is_root`` is true the
        result is also stored on ``self.data``.

        Raises:
            NotImplementedError: if the operand is not a ``PandasOperand``.
        """
        if self.main_column_node is None:
            main_column = None
        else:
            main_column = self.main_column_node.get_columns()[0]

        if not isinstance(self.op, PandasOperand):
            raise NotImplementedError(f"Unrecognized operator {self.op.name}.")

        columns = self.get_columns()
        new_data = self.op.calculate_group(data[columns], main_column=main_column)
        renames = dict(zip(columns, self.get_display_names()))
        new_data.rename(columns=renames, inplace=True)

        new_data.replace([-np.inf, np.inf], np.nan, inplace=True)

        if is_root:
            self.data = new_data
        return new_data

    @staticmethod
    def make_groups(candidates):
        """Bundle consecutive candidates that share an operand and anchor child.

        The anchor child is ``children[0]`` for unary and vector operands and
        ``children[1]`` otherwise. Runs whose operand is vectorizable become a
        ``FeatureGroup``; every other feature is passed through unchanged.
        ``itertools.groupby`` only merges adjacent items, so equal keys that
        are not next to each other in ``candidates`` end up in separate runs.
        """

        def _run_key(feature):
            anchor_index = 0 if feature.op.is_unary or feature.op.is_vector else 1
            return feature.op, feature.children[anchor_index]

        grouped = []
        for (operand, anchor), run in itertools.groupby(candidates, _run_key):
            members = list(run)
            if not operand.is_vectorizable:
                grouped.extend(members)
                continue
            # Unary operands have no main column to group around.
            main = None if operand.is_unary else anchor
            grouped.append(FeatureGroup(operand, main_column=main, children=members))
        return grouped

    def delete_data(self):
        """Drop cached data on this group, its main column node and all children."""
        self.data = None
        if self.main_column_node:
            self.main_column_node.delete_data()
        for child in self.children:
            child.delete_data()
72
+
73
+
74
class Feature(object):
    """A node of a feature expression tree: an operand applied to children.

    Children are either ``Column`` leaves or nested ``Feature`` nodes.
    """

    def __init__(self, op, children, data=None, display_index=None, cached_display_name=None, alias=None):
        self.op = op
        self.children = children
        self.data = data
        self.display_index = display_index
        self.cached_display_name = cached_display_name
        self.alias = alias

    def set_op_params(self, params: Dict):
        """Forward ``params`` to the operand's ``set_params`` and return ``self``.

        NOTE(review): ``from_formula`` obtains operands via ``find_op``, which
        looks them up in a shared registry, so this may mutate an operand
        instance shared by other features - confirm this is intended.
        """
        self.op.set_params(params)
        return self

    def get_hash(self):
        """Return the first 8 hex digits of a SHA-256 over operand and children.

        A child that has a ``name`` (a column leaf) contributes that name. A
        child without one (a nested feature) contributes its own hash; the
        original read ``ch.name`` unconditionally and raised ``AttributeError``
        for nested features.
        """
        parts = [self.op.name]
        for child in self.children:
            child_name = getattr(child, "name", None)
            parts.append(child.get_hash() if child_name is None else child_name)
        digest = hashlib.sha256("_".join(parts).encode("utf-8")).hexdigest()
        return digest[:8]

    def set_alias(self, alias):
        """Set the alias used by ``get_display_name`` and return ``self``."""
        self.alias = alias
        return self

    def rename_columns(self, mapping: Dict):
        """Rename columns in all children and invalidate the cached display name."""
        for child in self.children:
            child.rename_columns(mapping)
        self.cached_display_name = None
        return self

    def get_column_nodes(self):
        """Return the column nodes of all children, concatenated in child order."""
        nodes = []
        for child in self.children:
            nodes.extend(child.get_column_nodes())
        return nodes

    def get_columns(self, **kwargs):
        """Return the children's columns in first-seen order.

        A column already reported by an earlier child is not repeated.
        """
        column_list = []
        seen = set()
        for child in self.children:
            columns = child.get_columns(**kwargs)
            column_list.extend([col for col in columns if col not in seen])
            seen.update(columns)
        return column_list

    def delete_data(self):
        """Drop cached data on this node and on every child."""
        self.data = None
        for child in self.children:
            child.delete_data()

    def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs):
        """Build (or return the cached) display name of this feature.

        The name is ``f_autofe_<alias>`` when an alias is set,
        ``f_autofe_<op>`` when ``shorten`` is true and the operand is not
        unary, and ``f_<col>_f_<col>..._autofe_<op>`` otherwise. ``<op>`` is
        the operand alias or its lower-cased name. ``display_index`` is
        appended when it is not ``None``. With ``cache`` true the result is
        stored and reused by later calls.
        """
        if self.cached_display_name is not None and cache:
            return self.cached_display_name

        if self.alias:
            components = ["f_autofe", self.alias]
        elif shorten and not self.op.is_unary:
            components = ["f_autofe", self.op.alias or self.op.name.lower()]
        else:
            components = [
                "f_" + "_f_".join(self.get_columns(**kwargs)),
                "autofe",
                self.op.alias or self.op.name.lower(),
            ]
        if self.display_index is not None:
            components.append(str(self.display_index))
        display_name = "_".join(components)

        if cache:
            self.cached_display_name = display_name
        return display_name

    def set_display_index(self, index):
        """Set the display index, invalidate the cached name and return ``self``."""
        self.display_index = index
        self.cached_display_name = None
        return self

    def infer_type(self, data):
        """Return the operand's ``output_type``, or the first child's inferred type."""
        if self.op.output_type:
            return self.op.output_type
        # either a symmetrical operator or group by
        return self.children[0].infer_type(data)

    def calculate(self, data, is_root=False):
        """Evaluate this feature on ``data`` by evaluating the children first.

        Vector operands receive the list of all child results; other operands
        receive the first child as ``data``/``left`` and the second child (or
        ``None``) as ``right``. Infinite values are replaced by NaN unless the
        result dtype is ``category`` or ``object``. When ``is_root`` is true
        the result is also stored on ``self.data``.

        Raises:
            NotImplementedError: if the operand is not a ``PandasOperand``.
        """
        if isinstance(self.op, PandasOperand) and self.op.is_vector:
            child_results = [child.calculate(data) for child in self.children]
            new_data = self.op.calculate(data=child_results)
        elif isinstance(self.op, PandasOperand):
            d1 = self.children[0].calculate(data)
            d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
            new_data = self.op.calculate(data=d1, left=d1, right=d2)
        else:
            raise NotImplementedError(f"Unrecognized operator {self.op.name}.")

        if str(new_data.dtype) not in ("category", "object"):
            new_data = new_data.replace([-np.inf, np.inf], np.nan)

        if is_root:
            self.data = new_data
        return new_data

    @staticmethod
    def check_xor(left, right):
        """Return True when the two trees do not use exactly the same column names."""

        def _get_all_columns(feature):
            if isinstance(feature, Column):
                return [feature.name]
            names = []
            for child in feature.children:
                names.extend(_get_all_columns(child))
            return names

        # An empty symmetric difference means both sides use the same columns.
        return set(_get_all_columns(left)) != set(_get_all_columns(right))

    def to_formula(self, **kwargs):
        """Serialize the tree to a formula string.

        The four arithmetic operands are written infix as ``(left<op>right)``;
        every other operand is written as ``name(child,child,...)``.
        """
        if self.op.name in ["+", "-", "*", "/"]:
            left = self.children[0].to_formula(**kwargs)
            right = self.children[1].to_formula(**kwargs)
            return f"({left}{self.op.name}{right})"

        pieces = [self.op.name, "("]
        for child in self.children:
            pieces.append(child.to_formula(**kwargs))
            pieces.append(",")
        pieces.pop()  # drop the trailing separator
        pieces.append(")")
        return "".join(pieces)

    @staticmethod
    def from_formula(string):
        """Parse a formula string produced by ``to_formula`` back into a tree.

        A string that does not end with ``)`` is treated as a column name.
        Otherwise the arguments are located from right to left, and the
        operand is resolved with ``find_op``.
        """
        if string[-1] != ")":
            return Column(string)

        def is_trivial_char(c):
            # True for anything that is not formula punctuation.
            return not (c in "()+-*/,")

        def find_prev(string):
            # Return the start index of the last argument/term in ``string``.
            if string[-1] != ")":
                # Plain name: it starts right after the last punctuation char.
                return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
            # Parenthesized term: walk back to the matching "(" ...
            level, pos = 0, -1
            for i in range(len(string) - 1, -1, -1):
                if string[i] == ")":
                    level += 1
                if string[i] == "(":
                    level -= 1
                if level == 0:
                    pos = i
                    break
            # ... then include the operand name written before it.
            while (pos > 0) and is_trivial_char(string[pos - 1]):
                pos -= 1
            return pos

        p2 = find_prev(string[:-1])
        if string[p2 - 1] == "(":
            # Single argument: name(arg)
            return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
        p1 = find_prev(string[: p2 - 1])
        if string[0] == "(":
            # Infix form: (left<op>right); the operator is the char before p2.
            return Feature(
                find_op(string[p2 - 1]),
                [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
            )
        else:
            op = find_op(string[: p1 - 1])
            if op is not None:
                # Two arguments: name(a,b)
                return Feature(
                    op,
                    [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
                )
            else:
                # More than two arguments: collect them right-to-left until
                # the text before the current argument names an operand.
                base_features = [
                    Feature.from_formula(string[p2:-1]),
                    Feature.from_formula(string[p1 : p2 - 1]),
                ]
                while op is None:
                    p2 = p1
                    p1 = find_prev(string[: p1 - 1])
                    base_features.append(Feature.from_formula(string[p1 : p2 - 1]))
                    op = find_op(string[: p1 - 1])
                base_features.reverse()
                return Feature(op, base_features)
260
+
261
+
262
class Column(object):
    """Leaf of a feature expression tree: a reference to one input column."""

    def __init__(self, name, data=None, calculate_all=False):
        self.name = name
        self.data = data
        self.calculate_all = calculate_all

    def rename_columns(self, mapping: Dict):
        """Rename this column through ``mapping`` and strip the ``f_``/hash decoration.

        A name missing from ``mapping`` (or mapped to a falsy value) is kept
        before ``_unhash`` is applied. Returns ``self``.
        """
        self.name = self._unhash(mapping.get(self.name) or self.name)
        return self

    def _unhash(self, feature_name):
        """Strip the ``f_`` prefix and the trailing ``_<suffix>`` component.

        Names that do not start with ``f_`` are returned unchanged. When the
        only underscore is the one in the prefix, just the prefix is removed.
        """
        last_component_idx = feature_name.rfind("_")
        if not feature_name.startswith("f_"):
            return feature_name  # etalon feature
        elif last_component_idx == 1:
            return feature_name[2:]  # fully hashed name, cannot unhash
        else:
            return feature_name[2:last_component_idx]

    def delete_data(self):
        """Drop the cached column data."""
        self.data = None

    def get_column_nodes(self):
        """Return a one-element list containing this node."""
        return [self]

    def get_columns(self, **kwargs):
        """Return a one-element list containing the column name.

        ``**kwargs`` is accepted and ignored: ``Feature.get_columns``,
        ``FeatureGroup.get_columns`` and ``to_formula`` all forward their
        keyword arguments here, which previously raised ``TypeError`` for any
        non-empty kwargs.
        """
        return [self.name]

    def infer_type(self, data):
        """Return the dtype of this column in ``data``."""
        return data[self.name].dtype

    def calculate(self, data):
        """Select this column from ``data``, cache it on ``self.data`` and return it."""
        self.data = data[self.name]
        return self.data

    def to_formula(self, **kwargs):
        """Return the column name as a string."""
        return str(self.get_columns(**kwargs)[0])
@@ -0,0 +1,82 @@
1
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
+ from typing import Optional
3
+ import pandas as pd
4
+
5
+
6
class GroupByThenAgg(PandasOperand, VectorizableMixin):
    """Grouping operand: aggregate values per group and broadcast back to rows.

    ``agg`` is the aggregation passed to pandas ``.agg`` (for example
    ``"min"`` or ``"mean"``).
    """

    agg: Optional[str]
    is_vectorizable = True
    is_grouping = True
    is_distribution_dependent = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Aggregate ``left`` grouped by ``right``, then look each row's group up."""
        per_group = left.groupby(right).agg(self.agg)
        return self._loc(right, per_group)

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Aggregate every value column per main-column group.

        The aggregates are right-merged onto the main column so the result
        has one row per input row, restricted to the value columns.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        groups = data[group_column]
        per_group = values.groupby(groups).agg(self.agg)
        merged = per_group.merge(groups, how="right", on=[group_column])
        return merged[value_columns]
22
+
23
+
24
class GroupByThenMedian(GroupByThenAgg):
    """Group-by operand aggregating each group with the median."""

    name = "GroupByThenMedian"
    # The inherited calculate_* methods read ``self.agg``; the original only
    # set ``pandas_agg``, which left ``agg`` unset.
    agg = "median"
    # Kept for backward compatibility with any reader of the old attribute.
    pandas_agg = "median"
    is_distribution_dependent = True
28
+
29
+
30
class GroupByThenRank(PandasOperand, VectorizableMixin):
    """Grouping operand: ascending percentile rank of a value within its group."""

    name = "GroupByThenRank"
    is_vectorizable = True
    is_grouping = True
    output_type = "float"
    is_distribution_dependent = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Rank ``left`` within each ``right`` group.

        Rows with a missing group key are excluded from ranking and come back
        as NaN through the right merge on the reset ``index`` column.
        NOTE(review): merging on "index" presumably relies on an unnamed
        index - confirm with the caller.
        """
        ranked = left[~right.isna()].groupby(right).rank(ascending=True, pct=True)
        ranked_frame = pd.DataFrame(ranked).reset_index()
        keys_frame = pd.DataFrame(right).reset_index()
        return ranked_frame.merge(keys_frame, how="right", on=["index"])[left.name]

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Rank every value column within each main-column group.

        Rows with a missing group key are excluded from ranking and are
        restored by the right merge on the reset ``index`` column.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        groups = data[group_column]
        ranked = values[~groups.isna()].groupby(groups).rank(ascending=True, pct=True)
        ranked_frame = ranked[value_columns].reset_index()
        return ranked_frame.merge(groups.reset_index(), how="right", on=["index"])[value_columns]
47
+
48
+
49
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
    """Grouping operand: number of distinct values within each group."""

    name = "GroupByThenNUnique"
    is_vectorizable = True
    is_grouping = True
    output_type = "int"
    is_distribution_dependent = True
    input_type = "discrete"

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Count distinct ``left`` values per ``right`` group and look each row up."""
        distinct_per_group = left.groupby(right).nunique()
        return self._loc(right, distinct_per_group)

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Count distinct values of every value column per main-column group.

        The counts are right-merged onto the main column so the result has
        one row per input row, restricted to the value columns.
        """
        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
        values = data[value_columns]
        groups = data[group_column]
        distinct_per_group = values.groupby(groups).nunique()
        merged = distinct_per_group.merge(groups, how="right", on=[group_column])
        return merged[value_columns]
67
+
68
+
69
class GroupByThenFreq(PandasOperand):
    """Grouping operand: relative frequency of a value within its group."""

    name = "GroupByThenFreq"
    is_grouping = True
    output_type = "float"
    is_distribution_dependent = True
    input_type = "discrete"

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return, per row, how frequent its ``left`` value is inside its ``right`` group."""

        def _within_group_freq(group_values):
            # Normalized counts of this group's values, mapped back per row.
            shares = group_values.value_counts(normalize=True)
            return self._loc(group_values, shares)

        per_row = left.groupby(right).apply(_within_group_freq)
        return pd.Series(per_row, index=right.index)
@@ -0,0 +1,70 @@
1
+ from pydantic import BaseModel
2
+ from typing import Dict, List, Optional, Tuple
3
+ import abc
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+
8
class Operand(BaseModel):
    """Declarative description of a feature-engineering operand.

    The flags describe the operand's arity and properties; subclasses
    override them as class-level defaults.
    """

    # Identity
    name: str
    alias: Optional[str]
    # Arity / shape flags
    is_unary: bool = False
    has_symmetry_importance: bool = False
    # Declared input / output kinds
    input_type: Optional[str]
    output_type: Optional[str]
    is_categorical: bool = False
    is_vectorizable: bool = False
    is_grouping: bool = False
    is_binary: bool = False
    is_vector: bool = False
    is_distribution_dependent: bool = False
    # Free-form operand parameters
    params: Optional[Dict[str, str]]

    def set_params(self, params: Dict[str, str]):
        """Store ``params`` on the operand and return ``self`` for chaining."""
        self.params = params
        return self

    def get_params(self) -> Dict[str, str]:
        """Return the stored parameters."""
        stored = self.params
        return stored
29
+
30
+
31
# Keyword-argument name under which grouping operands receive the main column.
MAIN_COLUMN = "main_column"
32
+
33
+
34
class PandasOperand(Operand, abc.ABC):
    """Operand that can be evaluated on pandas data.

    Subclasses override whichever ``calculate_*`` method matches their arity;
    the defaults here return ``None``.
    """

    def calculate(self, **kwargs) -> pd.Series:
        """Dispatch to the unary, binary or vector implementation.

        Unary operands read ``kwargs["data"]``; binary and grouping operands
        read ``kwargs["left"]`` and ``kwargs["right"]``; everything else is
        treated as a vector operand and reads ``kwargs["data"]``.
        """
        if self.is_unary:
            return self.calculate_unary(kwargs["data"])
        if self.is_binary or self.is_grouping:
            return self.calculate_binary(kwargs["left"], kwargs["right"])
        return self.calculate_vector(kwargs["data"])

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Default unary implementation; returns ``None``."""
        return None

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Default binary implementation; returns ``None``."""
        return None

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Default vector implementation; returns ``None``."""
        return None

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Default group implementation; always raises.

        Raises:
            RuntimeError: with a message that says whether the operand is
                not vectorizable or simply has no group implementation.
        """
        if self.is_vectorizable:
            raise RuntimeError(f"Unimplemented calculate_group for operator {self.name}")
        raise RuntimeError(f"Cannot apply calculate_group: operator {self.name} is not vectorizable")

    def _loc(self, df_to, df_from):
        """Look every value of ``df_to`` up in ``df_from`` by label.

        A NaN -> NaN entry is written into ``df_from`` first, so missing keys
        map to NaN. Note that this modifies ``df_from`` in place.
        """
        df_from.loc[np.nan] = np.nan
        keys = df_to.fillna(np.nan)
        return keys.apply(lambda key: df_from.loc[key])
61
+
62
+
63
class VectorizableMixin(Operand):
    """Mixin with argument validation for operands that implement ``calculate_group``."""

    def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
        """Split ``input_columns`` into the main column and the value columns.

        Returns:
            ``(group_column, value_columns)`` where ``group_column`` is the
            ``main_column`` keyword argument and ``value_columns`` are all
            other input columns, in their original order.

        Raises:
            ValueError: if ``main_column`` is missing or falsy.
        """
        group_column = kwargs.get(MAIN_COLUMN)
        if not group_column:
            raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
        value_columns = [column for column in input_columns if column != group_column]
        return group_column, value_columns
@@ -0,0 +1,105 @@
1
+ from upgini.autofe.operand import PandasOperand
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+
6
class Abs(PandasOperand):
    """Unary operand: absolute value."""

    name = "abs"
    is_unary = True
    is_vectorizable = True

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``data.abs()``."""
        magnitude = data.abs()
        return magnitude

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``data.abs()`` for the whole frame; ``kwargs`` is unused."""
        magnitude = data.abs()
        return magnitude
16
+
17
+
18
class Log(PandasOperand):
    """Unary operand: natural logarithm of the absolute value; zeros become NaN."""

    name = "log"
    is_unary = True
    is_vectorizable = True
    output_type = "float"

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``log(|data|)`` after replacing zeros with NaN."""
        without_zeros = data.replace(0, np.nan)
        return np.log(np.abs(without_zeros))

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``log(|data|)`` for the whole frame, zeros replaced by NaN first."""
        without_zeros = data.replace(0, np.nan)
        return np.log(without_zeros.abs())
29
+
30
+
31
class Sqrt(PandasOperand):
    """Unary operand: square root of the absolute value."""

    name = "sqrt"
    is_unary = True
    is_vectorizable = True
    output_type = "float"

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``sqrt(|data|)``."""
        magnitude = np.abs(data)
        return np.sqrt(magnitude)

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``sqrt(|data|)`` for the whole frame; ``kwargs`` is unused."""
        magnitude = data.abs()
        return np.sqrt(magnitude)
42
+
43
+
44
class Square(PandasOperand):
    """Unary operand: element-wise square."""

    name = "square"
    is_unary = True
    is_vectorizable = True

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``np.square(data)``."""
        squared = np.square(data)
        return squared

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``np.square(data)`` for the whole frame; ``kwargs`` is unused."""
        squared = np.square(data)
        return squared
54
+
55
+
56
class Sigmoid(PandasOperand):
    """Unary operand: logistic function ``1 / (1 + exp(-x))``."""

    name = "sigmoid"
    is_unary = True
    is_vectorizable = True
    output_type = "float"

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``1 / (1 + exp(-data))``."""
        denominator = 1 + np.exp(-data)
        return 1 / denominator

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``1 / (1 + exp(-data))`` for the whole frame; ``kwargs`` is unused."""
        denominator = 1 + np.exp(-data)
        return 1 / denominator
67
+
68
+
69
class Floor(PandasOperand):
    """Unary operand: round down with ``np.floor``."""

    name = "floor"
    is_unary = True
    is_vectorizable = True
    # NOTE(review): declared "int", yet the methods return np.floor's output
    # without an integer cast - confirm how consumers use output_type.
    output_type = "int"
    input_type = "continuous"

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``np.floor(data)``."""
        floored = np.floor(data)
        return floored

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``np.floor(data)`` for the whole frame; ``kwargs`` is unused."""
        floored = np.floor(data)
        return floored
81
+
82
+
83
class Residual(PandasOperand):
    """Unary operand: the value minus its floor."""

    name = "residual"
    is_unary = True
    is_vectorizable = True
    input_type = "continuous"

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return ``data - floor(data)``."""
        whole_part = np.floor(data)
        return data - whole_part

    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Return ``data - floor(data)`` for the whole frame; ``kwargs`` is unused."""
        whole_part = np.floor(data)
        return data - whole_part
94
+
95
+
96
class Freq(PandasOperand):
    """Unary operand: relative frequency of each value in the column."""

    name = "freq"
    is_unary = True
    output_type = "float"
    is_distribution_dependent = True
    input_type = "discrete"

    def calculate_unary(self, data: pd.Series) -> pd.Series:
        """Return, per row, the normalized frequency of its value in ``data``."""
        shares = data.value_counts(normalize=True)
        return self._loc(data, shares)
@@ -0,0 +1,20 @@
1
+ from typing import List
2
+ import pandas as pd
3
+ from upgini.autofe.operand import PandasOperand
4
+
5
+
6
class Mean(PandasOperand):
    """Vector operand: row-wise mean over several series."""

    name = "mean"
    output_type = "float"
    is_vector = True

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Return the row-wise mean of ``data``.

        Missing values are filled with 0 before averaging, so they count
        toward the mean as zeros.
        """
        side_by_side = pd.DataFrame(data).T
        filled = side_by_side.fillna(0)
        return filled.mean(axis=1)
13
+
14
+
15
class Sum(PandasOperand):
    """Vector operand: row-wise sum over several series."""

    name = "sum"
    is_vector = True

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Return the row-wise sum of ``data`` with missing values filled by 0."""
        side_by_side = pd.DataFrame(data).T
        filled = side_by_side.fillna(0)
        return filled.sum(axis=1)
@@ -904,9 +904,8 @@ class FeaturesEnricher(TransformerMixin):
904
904
 
905
905
  model_task_type = self.model_task_type or define_task(y_sorted, self.logger, silent=True)
906
906
  _cv = cv or self.cv
907
- self.logger.info(f"CV: {_cv}, groups: {groups}")
908
907
  if groups is None and _cv == CVType.group_k_fold:
909
- self.logger.info(f"Replacing group_k_fold with k_fold as no groups were found")
908
+ self.logger.info("Replacing group_k_fold with k_fold as no groups were found")
910
909
  _cv = CVType.k_fold
911
910
  if not isinstance(_cv, BaseCrossValidator):
912
911
  date_column = self._get_date_column(search_keys)
@@ -68,7 +68,7 @@ class SearchKey(Enum):
68
68
  @staticmethod
69
69
  def personal_keys() -> List["SearchKey"]:
70
70
  return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
71
-
71
+
72
72
  @staticmethod
73
73
  def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
74
74
  if meaning_type == FileColumnMeaningType.EMAIL:
File without changes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.229a3
3
+ Version: 1.1.230
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -20,6 +20,14 @@ src/upgini.egg-info/requires.txt
20
20
  src/upgini.egg-info/top_level.txt
21
21
  src/upgini/ads_management/__init__.py
22
22
  src/upgini/ads_management/ads_manager.py
23
+ src/upgini/autofe/__init__.py
24
+ src/upgini/autofe/all_operands.py
25
+ src/upgini/autofe/binary.py
26
+ src/upgini/autofe/feature.py
27
+ src/upgini/autofe/groupby.py
28
+ src/upgini/autofe/operand.py
29
+ src/upgini/autofe/unary.py
30
+ src/upgini/autofe/vector.py
23
31
  src/upgini/data_source/__init__.py
24
32
  src/upgini/data_source/data_source_publisher.py
25
33
  src/upgini/mdc/__init__.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes