upgini 1.1.228__py3-none-any.whl → 1.1.231__py3-none-any.whl

This diff represents the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of upgini might be problematic.

upgini/autofe/__init__.py ADDED
File without changes

upgini/autofe/all_operands.py ADDED
@@ -0,0 +1,43 @@
+ from typing import Dict
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
+ from upgini.autofe.operand import Operand
+ from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
+ from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
+ from upgini.autofe.vector import Mean, Sum
+
+ ALL_OPERANDS: Dict[str, Operand] = {
+     op.name: op
+     for op in [
+         Freq(),
+         Mean(),
+         Sum(),
+         Abs(),
+         Log(),
+         Sqrt(),
+         Square(),
+         Sigmoid(),
+         Floor(),
+         Residual(),
+         Min(),
+         Max(),
+         Add(),
+         Subtract(),
+         Multiply(),
+         Divide(),
+         GroupByThenAgg(name="GroupByThenMin", agg="min"),
+         GroupByThenAgg(name="GroupByThenMax", agg="max"),
+         GroupByThenAgg(name="GroupByThenMean", agg="mean"),
+         GroupByThenAgg(name="GroupByThenMedian", agg="median"),
+         GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
+         GroupByThenRank(),
+         Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
+         Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
+         Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
+         Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
+         Sim(),
+     ]
+ }
+
+
+ def find_op(name):
+     return ALL_OPERANDS.get(name)
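
For orientation (illustrative, not part of the diff): the new registry maps each operand's name to a shared instance, and find_op is a plain dict lookup. A minimal sketch, assuming upgini 1.1.231 is installed:

    from upgini.autofe.all_operands import ALL_OPERANDS, find_op

    op = find_op("GroupByThenMin")   # the GroupByThenAgg instance registered above
    assert op is ALL_OPERANDS["GroupByThenMin"]
    print(op.agg)                    # "min"
    print(find_op("no_such_op"))     # None — dict.get returns None for unknown names
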
upgini/autofe/binary.py ADDED
@@ -0,0 +1,133 @@
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
+ import numpy as np
+ import pandas as pd
+ from numpy import dot
+ from numpy.linalg import norm
+
+
+ class Min(PandasOperand):
+     name = "min"
+     is_binary = True
+     has_symmetry_importance = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return np.minimum(left, right)
+
+
+ class Max(PandasOperand):
+     name = "max"
+     is_binary = True
+     has_symmetry_importance = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return np.maximum(left, right)
+
+
+ class Add(PandasOperand, VectorizableMixin):
+     name = "+"
+     alias = "add"
+     is_binary = True
+     has_symmetry_importance = True
+     is_vectorizable = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return left + right
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+
+         return d1.add(d2, axis=0)
+
+
+ class Subtract(PandasOperand, VectorizableMixin):
+     name = "-"
+     alias = "sub"
+     is_binary = True
+     has_symmetry_importance = True
+     is_vectorizable = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return left - right
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+
+         return d1.sub(d2, axis=0)
+
+
+ class Multiply(PandasOperand, VectorizableMixin):
+     name = "*"
+     alias = "mul"
+     is_binary = True
+     has_symmetry_importance = True
+     is_vectorizable = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return left * right
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+
+         return d1.mul(d2, axis=0)
+
+
+ class Divide(PandasOperand, VectorizableMixin):
+     name = "/"
+     alias = "div"
+     is_binary = True
+     has_symmetry_importance = True
+     is_vectorizable = True
+     output_type = "float"
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return left / right.replace(0, np.nan)
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+
+         return d1.div(d2.replace(0, np.nan), axis=0)
+
+
+ class Combine(PandasOperand):
+     name = "Combine"
+     is_binary = True
+     has_symmetry_importance = True
+     output_type = "object"
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         temp = left.astype(str) + "_" + right.astype(str)
+         temp[left.isna() | right.isna()] = np.nan
+         return pd.Series(temp, index=left.index)
+
+
+ class CombineThenFreq(PandasOperand):
+     name = "CombineThenFreq"
+     is_binary = True
+     has_symmetry_importance = True
+     output_type = "float"
+     is_distribution_dependent = True
+     input_type = "discrete"
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         temp = left.astype(str) + "_" + right.astype(str)
+         temp[left.isna() | right.isna()] = np.nan
+         value_counts = temp.value_counts(normalize=True)
+         return self._loc(temp, value_counts)
+
+
+ class Sim(PandasOperand):
+     name = "sim"
+     is_binary = True
+     output_type = "float"
+     has_symmetry_importance = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         return dot(left, right) / (norm(left) * norm(right))
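
Usage sketch (illustrative, not part of the diff; assumes upgini 1.1.231 is installed): Divide replaces zero denominators with NaN instead of producing inf:

    import pandas as pd
    from upgini.autofe.binary import Divide

    left = pd.Series([10.0, 4.0, 1.0])
    right = pd.Series([2.0, 0.0, 5.0])
    # Zero denominator becomes NaN: expected [5.0, nan, 0.2]
    print(Divide().calculate_binary(left, right).tolist())
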
upgini/autofe/feature.py ADDED
@@ -0,0 +1,298 @@
+ import hashlib
+ from typing import Dict
+ import numpy as np
+ import pandas as pd
+ import itertools
+ from upgini.autofe.operand import PandasOperand
+ from upgini.autofe.all_operands import (
+     find_op,
+ )
+
+
+ class FeatureGroup(object):
+     def __init__(self, op, main_column, children):
+         self.op = op
+         self.main_column_node = main_column
+         self.children = children
+         self.data = None
+
+     def get_columns(self, **kwargs):
+         column_list = []
+         seen = set()
+         for child in self.children:
+             columns = child.get_columns(**kwargs)
+             column_list.extend([f for f in columns if f not in seen])
+             seen.update(columns)
+         return column_list
+
+     def get_display_names(self, **kwargs):
+         names = [f.get_display_name(**kwargs) for f in self.children]
+         return names
+
+     def calculate(self, data: pd.DataFrame, is_root=False):
+         main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
+         if isinstance(self.op, PandasOperand):
+             columns = self.get_columns()
+             new_data = self.op.calculate_group(data[columns], main_column=main_column)
+             new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
+
+         else:
+             raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
+
+         new_data.replace([-np.inf, np.inf], np.nan, inplace=True)
+
+         if is_root:
+             self.data = new_data
+         return new_data
+
+     @staticmethod
+     def make_groups(candidates):
+         grouped_features = []
+         for op_child, features in itertools.groupby(
+             candidates, lambda f: (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
+         ):
+             op, main_child = op_child
+             feature_list = list(features)
+             if op.is_vectorizable:
+                 if op.is_unary:
+                     group = FeatureGroup(op, main_column=None, children=feature_list)
+                 else:
+                     group = FeatureGroup(op, main_column=main_child, children=feature_list)
+                 grouped_features.append(group)
+             else:
+                 grouped_features.extend(feature_list)
+         return grouped_features
+
+     def delete_data(self):
+         self.data = None
+         if self.main_column_node:
+             self.main_column_node.delete_data()
+         for child in self.children:
+             child.delete_data()
+
+
+ class Feature(object):
+     def __init__(self, op, children, data=None, display_index=None, cached_display_name=None, alias=None):
+         self.op = op
+         self.children = children
+         self.data = data
+         self.display_index = display_index
+         self.cached_display_name = cached_display_name
+         self.alias = alias
+
+     def set_op_params(self, params: Dict):
+         self.op.set_params(params)
+         return self
+
+     def get_hash(self):
+         return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
+             :8
+         ]
+
+     def set_alias(self, alias):
+         self.alias = alias
+         return self
+
+     def rename_columns(self, mapping: Dict):
+         for child in self.children:
+             child.rename_columns(mapping)
+         self.cached_display_name = None
+         return self
+
+     def get_column_nodes(self):
+         res = []
+         for child in self.children:
+             res.extend(child.get_column_nodes())
+         return res
+
+     def get_columns(self, **kwargs):
+         column_list = []
+         seen = set()
+         for child in self.children:
+             columns = child.get_columns(**kwargs)
+             column_list.extend([f for f in columns if f not in seen])
+             seen.update(columns)
+         return column_list
+
+     def delete_data(self):
+         self.data = None
+         for child in self.children:
+             child.delete_data()
+
+     def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs):
+         if self.cached_display_name is not None and cache:
+             return self.cached_display_name
+
+         if self.alias:
+             components = ["f_autofe", self.alias]
+         elif shorten and not self.op.is_unary:
+             components = ["f_autofe", self.op.alias or self.op.name.lower()]
+         else:
+             components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
+                 "autofe",
+                 self.op.alias or self.op.name.lower(),
+             ]
+         components.extend([str(self.display_index)] if self.display_index is not None else [])
+         display_name = "_".join(components)
+
+         if cache:
+             self.cached_display_name = display_name
+         return display_name
+
+     def set_display_index(self, index):
+         self.display_index = index
+         self.cached_display_name = None
+         return self
+
+     def infer_type(self, data):
+         if self.op.output_type:
+             return self.op.output_type
+         else:
+             # either a symmetrical operator or group by
+             return self.children[0].infer_type(data)
+
+     def calculate(self, data, is_root=False):
+         if isinstance(self.op, PandasOperand) and self.op.is_vector:
+             ds = [child.calculate(data) for child in self.children]
+             new_data = self.op.calculate(data=ds)
+
+         elif isinstance(self.op, PandasOperand):
+             d1 = self.children[0].calculate(data)
+             d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
+             new_data = self.op.calculate(data=d1, left=d1, right=d2)
+         else:
+             raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
+
+         if (str(new_data.dtype) == "category") | (str(new_data.dtype) == "object"):
+             pass
+         else:
+             new_data = new_data.replace([-np.inf, np.inf], np.nan)
+
+         if is_root:
+             self.data = new_data
+         return new_data
+
+     @staticmethod
+     def check_xor(left, right):
+         def _get_all_columns(feature):
+             if isinstance(feature, Column):
+                 return [feature.name]
+             else:
+                 res = []
+                 for child in feature.children:
+                     res.extend(_get_all_columns(child))
+                 return res
+
+         column1 = set(_get_all_columns(left))
+         column2 = set(_get_all_columns(right))
+         if len(column1 ^ column2) == 0:
+             return False
+         else:
+             return True
+
+     def to_formula(self, **kwargs):
+         if self.op.name in ["+", "-", "*", "/"]:
+             left = self.children[0].to_formula(**kwargs)
+             right = self.children[1].to_formula(**kwargs)
+             return f"({left}{self.op.name}{right})"
+         else:
+             result = [self.op.name, "("]
+             for i in range(len(self.children)):
+                 string_i = self.children[i].to_formula(**kwargs)
+                 result.append(string_i)
+                 result.append(",")
+             result.pop()
+             result.append(")")
+             return "".join(result)
+
+     @staticmethod
+     def from_formula(string):
+         if string[-1] != ")":
+             return Column(string)
+
+         def is_trivial_char(c):
+             return not (c in "()+-*/,")
+
+         def find_prev(string):
+             if string[-1] != ")":
+                 return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
+             level, pos = 0, -1
+             for i in range(len(string) - 1, -1, -1):
+                 if string[i] == ")":
+                     level += 1
+                 if string[i] == "(":
+                     level -= 1
+                 if level == 0:
+                     pos = i
+                     break
+             while (pos > 0) and is_trivial_char(string[pos - 1]):
+                 pos -= 1
+             return pos
+
+         p2 = find_prev(string[:-1])
+         if string[p2 - 1] == "(":
+             return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
+         p1 = find_prev(string[: p2 - 1])
+         if string[0] == "(":
+             return Feature(
+                 find_op(string[p2 - 1]),
+                 [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
+             )
+         else:
+             op = find_op(string[: p1 - 1])
+             if op is not None:
+                 return Feature(
+                     op,
+                     [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
+                 )
+             else:
+                 base_features = [
+                     Feature.from_formula(string[p2:-1]),
+                     Feature.from_formula(string[p1 : p2 - 1]),
+                 ]
+                 while op is None:
+                     p2 = p1
+                     p1 = find_prev(string[: p1 - 1])
+                     base_features.append(Feature.from_formula(string[p1 : p2 - 1]))
+                     op = find_op(string[: p1 - 1])
+                 base_features.reverse()
+                 return Feature(op, base_features)
+
+
+ class Column(object):
+     def __init__(self, name, data=None, calculate_all=False):
+         self.name = name
+         self.data = data
+         self.calculate_all = calculate_all
+
+     def rename_columns(self, mapping: Dict):
+         self.name = self._unhash(mapping.get(self.name) or self.name)
+         return self
+
+     def _unhash(self, feature_name):
+         last_component_idx = feature_name.rfind("_")
+         if not feature_name.startswith("f_"):
+             return feature_name  # etalon feature
+         elif last_component_idx == 1:
+             return feature_name[2:]  # fully hashed name, cannot unhash
+         else:
+             return feature_name[2:last_component_idx]
+
+     def delete_data(self):
+         self.data = None
+
+     def get_column_nodes(self):
+         return [self]
+
+     def get_columns(self):
+         return [self.name]
+
+     def infer_type(self, data):
+         return data[self.name].dtype
+
+     def calculate(self, data):
+         self.data = data[self.name]
+         return self.data
+
+     def to_formula(self, **kwargs):
+         return str(self.get_columns(**kwargs)[0])
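
Illustrative only (not part of the diff): Feature.from_formula parses the hashed-name formulas stored in GeneratedFeatureMetadata back into an operand tree, and to_formula is its inverse. A minimal sketch, assuming upgini 1.1.231 is installed:

    from upgini.autofe.feature import Feature

    f = Feature.from_formula("(a+b)")
    print(f.op.name)        # "+"
    print(f.to_formula())   # "(a+b)" — round-trips
    print(Feature.from_formula("log(a)").op.name)  # "log"
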
upgini/autofe/groupby.py ADDED
@@ -0,0 +1,82 @@
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
+ from typing import Optional
+ import pandas as pd
+
+
+ class GroupByThenAgg(PandasOperand, VectorizableMixin):
+     agg: Optional[str]
+     is_vectorizable = True
+     is_grouping = True
+     is_distribution_dependent = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         temp = left.groupby(right).agg(self.agg)
+         return self._loc(right, temp)
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+         temp = d1.groupby(d2).agg(self.agg)
+         return temp.merge(d2, how="right", on=[group_column])[value_columns]
+
+
+ class GroupByThenMedian(GroupByThenAgg):
+     name = "GroupByThenMedian"
+     pandas_agg = "median"
+     is_distribution_dependent = True
+
+
+ class GroupByThenRank(PandasOperand, VectorizableMixin):
+     name = "GroupByThenRank"
+     is_vectorizable = True
+     is_grouping = True
+     output_type = "float"
+     is_distribution_dependent = True
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
+         return temp.merge(pd.DataFrame(right).reset_index(), how="right", on=["index"])[left.name]
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+         temp = d1[~d2.isna()].groupby(d2).rank(ascending=True, pct=True)[value_columns].reset_index()
+         return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
+
+
+ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
+     name = "GroupByThenNUnique"
+     is_vectorizable = True
+     is_grouping = True
+     output_type = "int"
+     is_distribution_dependent = True
+     input_type = "discrete"
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         nunique = left.groupby(right).nunique()
+         return self._loc(right, nunique)
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+         d1 = data[value_columns]
+         d2 = data[group_column]
+         nunique = d1.groupby(d2).nunique()
+         return nunique.merge(d2, how="right", on=[group_column])[value_columns]
+
+
+ class GroupByThenFreq(PandasOperand):
+     name = "GroupByThenFreq"
+     is_grouping = True
+     output_type = "float"
+     is_distribution_dependent = True
+     input_type = "discrete"
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         def _f(x):
+             value_counts = x.value_counts(normalize=True)
+             return self._loc(x, value_counts)
+
+         freq = left.groupby(right).apply(_f)
+         return pd.Series(freq, index=right.index)
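
Usage sketch (illustrative, not part of the diff; assumes upgini 1.1.231 is installed): calculate_binary aggregates the left series within each group of the right series, then broadcasts the per-group result back to row level via _loc:

    import pandas as pd
    from upgini.autofe.groupby import GroupByThenAgg

    value = pd.Series([1.0, 2.0, 3.0, 4.0], name="v")
    group = pd.Series(["a", "a", "b", "b"], name="g")
    op = GroupByThenAgg(name="GroupByThenMean", agg="mean")
    # Each row gets its group mean: expected [1.5, 1.5, 3.5, 3.5]
    print(op.calculate_binary(value, group).tolist())
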
upgini/autofe/operand.py ADDED
@@ -0,0 +1,70 @@
+ from pydantic import BaseModel
+ from typing import Dict, List, Optional, Tuple
+ import abc
+ import pandas as pd
+ import numpy as np
+
+
+ class Operand(BaseModel):
+     name: str
+     alias: Optional[str]
+     is_unary: bool = False
+     has_symmetry_importance: bool = False
+     input_type: Optional[str]
+     output_type: Optional[str]
+     is_categorical: bool = False
+     is_vectorizable: bool = False
+     is_grouping: bool = False
+     is_binary: bool = False
+     is_vector: bool = False
+     is_distribution_dependent: bool = False
+     params: Optional[Dict[str, str]]
+
+     def set_params(self, params: Dict[str, str]):
+         self.params = params
+         return self
+
+     def get_params(self) -> Dict[str, str]:
+         return self.params
+
+
+ MAIN_COLUMN = "main_column"
+
+
+ class PandasOperand(Operand, abc.ABC):
+     def calculate(self, **kwargs) -> pd.Series:
+         if self.is_unary:
+             return self.calculate_unary(kwargs["data"])
+         elif self.is_binary or self.is_grouping:
+             return self.calculate_binary(kwargs["left"], kwargs["right"])
+         else:
+             return self.calculate_vector(kwargs["data"])
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         pass
+
+     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         pass
+
+     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+         pass
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         if not self.is_vectorizable:
+             raise RuntimeError(f"Cannot apply calculate_group: operator {self.name} is not vectorizable")
+         else:
+             raise RuntimeError(f"Unimplemented calculate_group for operator {self.name}")
+
+     def _loc(self, df_to, df_from):
+         df_from.loc[np.nan] = np.nan
+         return df_to.fillna(np.nan).apply(lambda x: df_from.loc[x])
+
+
+ class VectorizableMixin(Operand):
+     def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
+         if not kwargs.get(MAIN_COLUMN):
+             raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
+         group_column = kwargs[MAIN_COLUMN]
+         value_columns = [col for col in input_columns if col != group_column]
+
+         return group_column, value_columns
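
As context (illustrative, not part of the diff): Operand is a pydantic model of operator metadata, and PandasOperand.calculate dispatches on the is_unary / is_binary / is_grouping flags. A minimal sketch of a custom unary operand — Clip01 is hypothetical, purely for illustration — assuming upgini 1.1.231 is installed:

    import pandas as pd
    from upgini.autofe.operand import PandasOperand

    class Clip01(PandasOperand):  # hypothetical example operand, not part of upgini
        name = "clip01"
        is_unary = True
        output_type = "float"

        def calculate_unary(self, data: pd.Series) -> pd.Series:
            return data.clip(lower=0, upper=1)

    # is_unary routes calculate(**kwargs) to calculate_unary(kwargs["data"])
    print(Clip01().calculate(data=pd.Series([-1.0, 0.5, 2.0])).tolist())  # [0.0, 0.5, 1.0]
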
upgini/autofe/unary.py ADDED
@@ -0,0 +1,105 @@
+ from upgini.autofe.operand import PandasOperand
+ import numpy as np
+ import pandas as pd
+
+
+ class Abs(PandasOperand):
+     name = "abs"
+     is_unary = True
+     is_vectorizable = True
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return data.abs()
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return data.abs()
+
+
+ class Log(PandasOperand):
+     name = "log"
+     is_unary = True
+     is_vectorizable = True
+     output_type = "float"
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return np.log(np.abs(data.replace(0, np.nan)))
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return np.log(data.replace(0, np.nan).abs())
+
+
+ class Sqrt(PandasOperand):
+     name = "sqrt"
+     is_unary = True
+     is_vectorizable = True
+     output_type = "float"
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return np.sqrt(np.abs(data))
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return np.sqrt(data.abs())
+
+
+ class Square(PandasOperand):
+     name = "square"
+     is_unary = True
+     is_vectorizable = True
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return np.square(data)
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return np.square(data)
+
+
+ class Sigmoid(PandasOperand):
+     name = "sigmoid"
+     is_unary = True
+     is_vectorizable = True
+     output_type = "float"
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return 1 / (1 + np.exp(-data))
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return 1 / (1 + np.exp(-data))
+
+
+ class Floor(PandasOperand):
+     name = "floor"
+     is_unary = True
+     is_vectorizable = True
+     output_type = "int"
+     input_type = "continuous"
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return np.floor(data)
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return np.floor(data)
+
+
+ class Residual(PandasOperand):
+     name = "residual"
+     is_unary = True
+     is_vectorizable = True
+     input_type = "continuous"
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         return data - np.floor(data)
+
+     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+         return data - np.floor(data)
+
+
+ class Freq(PandasOperand):
+     name = "freq"
+     is_unary = True
+     output_type = "float"
+     is_distribution_dependent = True
+     input_type = "discrete"
+
+     def calculate_unary(self, data: pd.Series) -> pd.Series:
+         value_counts = data.value_counts(normalize=True)
+         return self._loc(data, value_counts)
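
Quick check of Freq's semantics (illustrative, not part of the diff; assumes upgini 1.1.231 is installed) — each value is mapped to its relative frequency in the series:

    import pandas as pd
    from upgini.autofe.unary import Freq

    data = pd.Series(["x", "x", "x", "y"])
    # Expected [0.75, 0.75, 0.75, 0.25]
    print(Freq().calculate_unary(data).tolist())
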
upgini/autofe/vector.py ADDED
@@ -0,0 +1,20 @@
+ from typing import List
+ import pandas as pd
+ from upgini.autofe.operand import PandasOperand
+
+
+ class Mean(PandasOperand):
+     name = "mean"
+     output_type = "float"
+     is_vector = True
+
+     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+         return pd.DataFrame(data).T.fillna(0).mean(axis=1)
+
+
+ class Sum(PandasOperand):
+     name = "sum"
+     is_vector = True
+
+     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
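
Note on semantics (illustrative sketch, not part of the diff; assumes upgini 1.1.231 is installed): missing values are zero-filled before aggregating, so Mean divides by the number of input series, not the number of non-null values:

    import pandas as pd
    from upgini.autofe.vector import Mean

    s1 = pd.Series([1.0, 2.0])
    s2 = pd.Series([3.0, None])
    # Row 1: (2.0 + 0.0) / 2 = 1.0 because the NaN is zero-filled; expected [2.0, 1.0]
    print(Mean().calculate_vector([s1, s2]).tolist())
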
upgini/features_enricher.py CHANGED
@@ -5,7 +5,6 @@ import logging
  import numbers
  import os
  import pickle
- import re
  import sys
  import tempfile
  import time
@@ -22,6 +21,7 @@ from scipy.stats import ks_2samp
  from sklearn.base import TransformerMixin
  from sklearn.exceptions import NotFittedError
  from sklearn.model_selection import BaseCrossValidator
+ from upgini.autofe.feature import Feature
 
  from upgini.data_source.data_source_publisher import CommercialSchema
  from upgini.dataset import Dataset
@@ -251,6 +251,7 @@ class FeaturesEnricher(TransformerMixin):
              raise e
 
          self.runtime_parameters = runtime_parameters or RuntimeParameters()
+         self.runtime_parameters.properties["feature_generation_params.hash_index"] = True
          self.date_format = date_format
          self.random_state = random_state
          self.detect_missing_search_keys = detect_missing_search_keys
@@ -904,6 +905,9 @@ class FeaturesEnricher(TransformerMixin):
 
          model_task_type = self.model_task_type or define_task(y_sorted, self.logger, silent=True)
          _cv = cv or self.cv
+         if groups is None and _cv == CVType.group_k_fold:
+             self.logger.info("Replacing group_k_fold with k_fold as no groups were found")
+             _cv = CVType.k_fold
          if not isinstance(_cv, BaseCrossValidator):
              date_column = self._get_date_column(search_keys)
              date_series = validated_X[date_column] if date_column is not None else None
@@ -1629,9 +1633,9 @@ class FeaturesEnricher(TransformerMixin):
                  c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
              ]
             features_section = (
-                 ', "features": {' +
-                 ", ".join([f'"{feature}": "test_value"' for feature in original_features_for_transform]) +
-                 "}"
+                 ', "features": {'
+                 + ", ".join([f'"{feature}": "test_value"' for feature in original_features_for_transform])
+                 + "}"
             )
         else:
             features_section = ""
@@ -2269,7 +2273,7 @@ class FeaturesEnricher(TransformerMixin):
              msg = bundle.get("multivariate_timeseries_detected")
              self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
          elif (
-             (self.cv is None or self.cv == CVType.k_fold)
+             self.cv is None
              and model_task_type != ModelTaskType.REGRESSION
              and self._get_group_columns(self.fit_search_keys)
          ):
@@ -2907,13 +2911,20 @@ class FeaturesEnricher(TransformerMixin):
 
              descriptions = []
              for m in autofe_meta:
+                 autofe_feature = Feature.from_formula(m.formula)
+                 if autofe_feature.op.is_vector:
+                     continue
+
                  description = dict()
 
                  feature_meta = get_feature_by_display_index(m.display_index)
                  if feature_meta is None:
                      self.logger.warning(f"Feature meta for display index {m.display_index} not found")
                      continue
-                 description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "")
+                 description["shap"] = feature_meta.shap_value
+                 description["Sources"] = feature_meta.data_source\
+                     .replace("AutoFE: features from ", "")\
+                     .replace("AutoFE: feature from ", "")
                  description["Feature name"] = feature_meta.name
 
                  feature_idx = 1
@@ -2921,11 +2932,7 @@ class FeaturesEnricher(TransformerMixin):
                      description[f"Feature {feature_idx}"] = bc.hashed_name
                      feature_idx += 1
 
-                 match = re.match(f"f_autofe_(.+)_{m.display_index}", feature_meta.name)
-                 if match is None:
-                     self.logger.warning(f"Failed to infer autofe function from name {feature_meta.name}")
-                 else:
-                     description["Function"] = match.group(1)
+                 description["Function"] = autofe_feature.op.name
 
                  descriptions.append(description)
 
@@ -2934,6 +2941,8 @@ class FeaturesEnricher(TransformerMixin):
 
              descriptions_df = pd.DataFrame(descriptions)
              descriptions_df.fillna("", inplace=True)
+             descriptions_df.sort_values(by="shap", ascending=False, inplace=True)
+             descriptions_df.drop(columns="shap", inplace=True)
              return descriptions_df
          except Exception:
              self.logger.exception("Failed to generate AutoFE features description")
upgini/metadata.py CHANGED
@@ -68,7 +68,7 @@ class SearchKey(Enum):
      @staticmethod
      def personal_keys() -> List["SearchKey"]:
          return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
- 
+
      @staticmethod
      def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
          if meaning_type == FileColumnMeaningType.EMAIL:
@@ -247,11 +247,12 @@ class BaseColumnMetadata(BaseModel):
      original_name: str
      hashed_name: str
      ads_definition_id: Optional[str]
+     is_augmented: bool
 
 
  class GeneratedFeatureMetadata(BaseModel):
      formula: str  # on hashed names
-     display_index: int
+     display_index: str
      base_columns: List[BaseColumnMetadata]
 
 
upgini/utils/cv_utils.py CHANGED
@@ -22,7 +22,7 @@ class CVConfig:
          elif isinstance(cv_type, CVType):
              self.cv_type = cv_type
          else:
-             raise Exception(f"Unexcpected type of cv_type: {type(cv_type)}")
+             raise Exception(f"Unexpected type of cv_type: {type(cv_type)}")
 
          self.shuffle_kfold: Optional[bool] = shuffle_kfold
          self.test_size = 0.2
upgini-1.1.228.dist-info/METADATA → upgini-1.1.231.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.228
+ Version: 1.1.231
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
upgini-1.1.228.dist-info/RECORD → upgini-1.1.231.dist-info/RECORD RENAMED
@@ -2,16 +2,24 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
  upgini/dataset.py,sha256=7z9zbVvd1_MiufmoZlCwEHwQ25Q2DX_0g9PFcSMlqMY,49764
  upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
- upgini/features_enricher.py,sha256=Zk-zVTVdGK5Wlywda7e7W43UYNFLtMIiNg7hzwJlzf0,158363
+ upgini/features_enricher.py,sha256=2B9rk_8QNMV7o1khbgZX8A1T6vJqyfki4F4UAYoR0po,158857
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
  upgini/http.py,sha256=HzUSZudCdISJGUqHC1gAT1v_x1n_dIFVDJW4z3Q7DCs,41204
- upgini/metadata.py,sha256=050EPfvwDRv676e-ZFr3OQ1qyR0AFLEFfhqD_iqxBQE,9565
+ upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
  upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
  upgini/search_task.py,sha256=7YxH1zrUHMmePO0VbPBBCJjeoer7jAC0Gltc9EVAOIg,17126
  upgini/spinner.py,sha256=yhakBaydMNS8E8TRAwTdCMdnWrHeWT0cR1M8c9hP6jA,1157
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
  upgini/ads_management/ads_manager.py,sha256=O6Pcl_y5e_ULfQ-xmGGn_qBP4z7EtV7TP9etjrsLkLE,2647
+ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s,1487
+ upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
+ upgini/autofe/feature.py,sha256=cElNcLfw9BeBVUkkaFzWWXrnyWNUCXiw0FGqsitorbE,10133
+ upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
+ upgini/autofe/operand.py,sha256=8WqEoSIA5rEWCK1xuC303E4NW5a72GZ5jUMAEj4skII,2291
+ upgini/autofe/unary.py,sha256=7TBe7PCt7l_XQEqu_G5g_TC2cW3tppL7uPDcX8xsqz0,2731
+ upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/data_source/data_source_publisher.py,sha256=zFu0WMKwPM11gPZHq8dpsBP7s4wmTtBqYoDEakgNxoY,13725
  upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
@@ -30,7 +38,7 @@ upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCS
  upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
  upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o,6436
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
- upgini/utils/cv_utils.py,sha256=NTVd7itVWiyrEiM2LPXHGXIzMCncdGR4SRxrOu57Euc,2492
+ upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
  upgini/utils/datetime_utils.py,sha256=P56e7gcgAogJYfs2Blzk1uypxb9yrFzNaeJpMCRm6Zc,7716
  upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
  upgini/utils/email_utils.py,sha256=MhCLUAWqbp81xRyKizauNhVx6t_MFeJQRQ8pFM7EpFo,3480
@@ -45,8 +53,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
  upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
  upgini/utils/track_info.py,sha256=DVNVZmXUb4f25DSPEuUNEFx49hNEBfmuY9iSW5jkMnI,5708
  upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
- upgini-1.1.228.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.1.228.dist-info/METADATA,sha256=efz_h9E6ySqjajEYyIDs_C60RESULmo11iPnhd04FL0,48398
- upgini-1.1.228.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- upgini-1.1.228.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
- upgini-1.1.228.dist-info/RECORD,,
+ upgini-1.1.231.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.1.231.dist-info/METADATA,sha256=itfTK0u6dBS8WjaITVm9TcsXC2mgQ_W0LRBqB_Nialk,48398
+ upgini-1.1.231.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ upgini-1.1.231.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+ upgini-1.1.231.dist-info/RECORD,,