upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -10
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +78 -54
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +936 -541
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
- upgini-1.2.31a1.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/autofe/feature.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import itertools
|
|
3
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
3
|
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -16,6 +16,15 @@ class Column:
|
|
|
16
16
|
self.data = data
|
|
17
17
|
self.calculate_all = calculate_all
|
|
18
18
|
|
|
19
|
+
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
|
20
|
+
return self.name
|
|
21
|
+
|
|
22
|
+
def set_op_params(self, params: Dict[str, str]) -> "Column":
|
|
23
|
+
return self
|
|
24
|
+
|
|
25
|
+
def get_op_params(self, **kwargs):
|
|
26
|
+
return dict()
|
|
27
|
+
|
|
19
28
|
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
20
29
|
self.name = self._unhash(mapping.get(self.name) or self.name)
|
|
21
30
|
return self
|
|
@@ -35,9 +44,13 @@ class Column:
|
|
|
35
44
|
def get_column_nodes(self) -> List["Column"]:
|
|
36
45
|
return [self]
|
|
37
46
|
|
|
38
|
-
def get_columns(self) -> List[str]:
|
|
47
|
+
def get_columns(self, **kwargs) -> List[str]:
|
|
39
48
|
return [self.name]
|
|
40
49
|
|
|
50
|
+
@property
|
|
51
|
+
def children(self) -> List[Union["Feature", "Column"]]:
|
|
52
|
+
return []
|
|
53
|
+
|
|
41
54
|
def infer_type(self, data: pd.DataFrame) -> DtypeObj:
|
|
42
55
|
return data[self.name].dtype
|
|
43
56
|
|
|
@@ -51,6 +64,12 @@ class Column:
|
|
|
51
64
|
def to_pretty_formula(self) -> str:
|
|
52
65
|
return self.to_formula()
|
|
53
66
|
|
|
67
|
+
def __eq__(self, value: object) -> bool:
|
|
68
|
+
if not isinstance(value, Column):
|
|
69
|
+
return False
|
|
70
|
+
else:
|
|
71
|
+
return self.name == value.name and self.calculate_all == value.calculate_all
|
|
72
|
+
|
|
54
73
|
|
|
55
74
|
class Feature:
|
|
56
75
|
def __init__(
|
|
@@ -69,19 +88,51 @@ class Feature:
|
|
|
69
88
|
self.cached_display_name = cached_display_name
|
|
70
89
|
self.alias = alias
|
|
71
90
|
|
|
72
|
-
def set_op_params(self, params: Dict[str, str]) -> "Feature":
|
|
91
|
+
def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
|
|
92
|
+
obj_dict = self.op.dict().copy()
|
|
93
|
+
obj_dict.update(params or {})
|
|
94
|
+
self.op = self.op.__class__.parse_obj(obj_dict)
|
|
73
95
|
self.op.set_params(params)
|
|
96
|
+
|
|
97
|
+
for child in self.children:
|
|
98
|
+
child_params = {
|
|
99
|
+
k[len(child.get_display_name()) + 1 :]: v
|
|
100
|
+
for k, v in params.items()
|
|
101
|
+
if k.startswith(child.get_display_name())
|
|
102
|
+
}
|
|
103
|
+
if not child_params:
|
|
104
|
+
child_params = params
|
|
105
|
+
child.set_op_params(child_params)
|
|
74
106
|
return self
|
|
75
107
|
|
|
108
|
+
def get_op_params(self, **kwargs) -> Dict[str, str]:
|
|
109
|
+
return {
|
|
110
|
+
k: str(v)
|
|
111
|
+
for k, v in dict(
|
|
112
|
+
(
|
|
113
|
+
(f"{child.get_display_name(**kwargs)}_{k}", v)
|
|
114
|
+
for child in self.children
|
|
115
|
+
for k, v in child.get_op_params(**kwargs).items()
|
|
116
|
+
),
|
|
117
|
+
**(self.op.get_params() or {}),
|
|
118
|
+
).items()
|
|
119
|
+
if v is not None
|
|
120
|
+
}
|
|
121
|
+
|
|
76
122
|
def get_hash(self) -> str:
|
|
77
|
-
return hashlib.sha256(
|
|
78
|
-
|
|
79
|
-
]
|
|
123
|
+
return hashlib.sha256(
|
|
124
|
+
"_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
|
|
125
|
+
).hexdigest()[:8]
|
|
80
126
|
|
|
81
127
|
def set_alias(self, alias: str) -> "Feature":
|
|
82
128
|
self.alias = alias
|
|
83
129
|
return self
|
|
84
130
|
|
|
131
|
+
def get_all_operand_names(self) -> Set[str]:
|
|
132
|
+
return {self.op.name}.union(
|
|
133
|
+
{n for f in self.children if isinstance(f, Feature) for n in f.get_all_operand_names()}
|
|
134
|
+
)
|
|
135
|
+
|
|
85
136
|
def rename_columns(self, mapping: Dict[str, str]) -> "Feature":
|
|
86
137
|
for child in self.children:
|
|
87
138
|
child.rename_columns(mapping)
|
|
@@ -108,19 +159,24 @@ class Feature:
|
|
|
108
159
|
for child in self.children:
|
|
109
160
|
child.delete_data()
|
|
110
161
|
|
|
162
|
+
def get_op_display_name(self) -> str:
|
|
163
|
+
return self.op.alias or self.op.name.lower()
|
|
164
|
+
|
|
111
165
|
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
|
112
166
|
if self.cached_display_name is not None and cache:
|
|
113
167
|
return self.cached_display_name
|
|
114
168
|
|
|
169
|
+
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
170
|
+
prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
|
|
171
|
+
|
|
115
172
|
if self.alias:
|
|
116
173
|
components = ["f_autofe", self.alias]
|
|
117
|
-
elif shorten and not self.op.is_unary:
|
|
118
|
-
components = ["f_autofe"
|
|
174
|
+
elif shorten and (not self.op.is_unary or should_stack_op):
|
|
175
|
+
components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
|
|
119
176
|
else:
|
|
120
|
-
components =
|
|
121
|
-
"autofe"
|
|
122
|
-
|
|
123
|
-
]
|
|
177
|
+
components = (
|
|
178
|
+
["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
|
|
179
|
+
)
|
|
124
180
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
125
181
|
display_name = "_".join(components)
|
|
126
182
|
|
|
@@ -211,12 +267,19 @@ class Feature:
|
|
|
211
267
|
|
|
212
268
|
@staticmethod
|
|
213
269
|
def from_formula(string: str) -> Union[Column, "Feature"]:
|
|
214
|
-
if string[-1] != ")":
|
|
215
|
-
return Column(string)
|
|
216
270
|
|
|
217
271
|
def is_trivial_char(c: str) -> bool:
|
|
218
272
|
return c not in "()+-*/,"
|
|
219
273
|
|
|
274
|
+
if string[-1] != ")":
|
|
275
|
+
if all(is_trivial_char(c) for c in string):
|
|
276
|
+
return Column(string)
|
|
277
|
+
else:
|
|
278
|
+
raise ValueError(
|
|
279
|
+
f"Unsupported column name: {string}. Column names should not have characters: "
|
|
280
|
+
"['(', ')', '+', '-', '*', '/', ',']"
|
|
281
|
+
)
|
|
282
|
+
|
|
220
283
|
def find_prev(string: str) -> int:
|
|
221
284
|
if string[-1] != ")":
|
|
222
285
|
return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
|
|
@@ -238,8 +301,11 @@ class Feature:
|
|
|
238
301
|
return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
|
|
239
302
|
p1 = find_prev(string[: p2 - 1])
|
|
240
303
|
if string[0] == "(":
|
|
304
|
+
op = find_op(string[p2 - 1])
|
|
305
|
+
if op is None:
|
|
306
|
+
raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
|
|
241
307
|
return Feature(
|
|
242
|
-
|
|
308
|
+
op,
|
|
243
309
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
244
310
|
)
|
|
245
311
|
else:
|
|
@@ -250,6 +316,8 @@ class Feature:
|
|
|
250
316
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
251
317
|
)
|
|
252
318
|
else:
|
|
319
|
+
if string[p1 - 1] == "(":
|
|
320
|
+
raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
|
|
253
321
|
base_features = [
|
|
254
322
|
Feature.from_formula(string[p2:-1]),
|
|
255
323
|
Feature.from_formula(string[p1 : p2 - 1]),
|
|
@@ -286,11 +354,26 @@ class FeatureGroup:
|
|
|
286
354
|
return names
|
|
287
355
|
|
|
288
356
|
def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
|
|
289
|
-
main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
|
|
290
357
|
if isinstance(self.op, PandasOperand):
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
358
|
+
main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
|
|
359
|
+
lower_order_children = []
|
|
360
|
+
if self.main_column_node is not None:
|
|
361
|
+
lower_order_children.append(self.main_column_node)
|
|
362
|
+
lower_order_children.extend(
|
|
363
|
+
ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
|
|
364
|
+
)
|
|
365
|
+
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
366
|
+
child_data = pd.concat(
|
|
367
|
+
[ch.calculate(data) for ch in lower_order_children],
|
|
368
|
+
axis=1,
|
|
369
|
+
)
|
|
370
|
+
child_data.columns = lower_order_names
|
|
371
|
+
|
|
372
|
+
new_data = self.op.calculate_group(child_data, main_column=main_column)
|
|
373
|
+
new_data.rename(
|
|
374
|
+
columns=dict(zip((n for n in lower_order_names if n != main_column), self.get_display_names())),
|
|
375
|
+
inplace=True,
|
|
376
|
+
)
|
|
294
377
|
else:
|
|
295
378
|
raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
|
|
296
379
|
|
upgini/autofe/groupby.py
CHANGED
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
is_grouping = True
|
|
12
|
-
is_distribution_dependent = True
|
|
10
|
+
is_vectorizable: bool = True
|
|
11
|
+
is_grouping: bool = True
|
|
12
|
+
is_distribution_dependent: bool = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name = "GroupByThenMedian"
|
|
28
|
-
pandas_agg = "median"
|
|
29
|
-
is_distribution_dependent = True
|
|
27
|
+
name: str = "GroupByThenMedian"
|
|
28
|
+
pandas_agg: str = "median"
|
|
29
|
+
is_distribution_dependent: bool = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name = "GroupByThenRank"
|
|
34
|
-
is_vectorizable = True
|
|
35
|
-
is_grouping = True
|
|
36
|
-
output_type = "float"
|
|
37
|
-
is_distribution_dependent = True
|
|
33
|
+
name: str = "GroupByThenRank"
|
|
34
|
+
is_vectorizable: bool = True
|
|
35
|
+
is_grouping: bool = True
|
|
36
|
+
output_type: Optional[str] = "float"
|
|
37
|
+
is_distribution_dependent: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name = "GroupByThenNUnique"
|
|
53
|
-
is_vectorizable = True
|
|
54
|
-
is_grouping = True
|
|
55
|
-
output_type = "int"
|
|
56
|
-
is_distribution_dependent = True
|
|
57
|
-
input_type = "discrete"
|
|
52
|
+
name: str = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
is_grouping: bool = True
|
|
55
|
+
output_type: Optional[str] = "int"
|
|
56
|
+
is_distribution_dependent: bool = True
|
|
57
|
+
input_type: Optional[str] = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name = "GroupByThenFreq"
|
|
73
|
-
is_grouping = True
|
|
74
|
-
output_type = "float"
|
|
75
|
-
is_distribution_dependent = True
|
|
76
|
-
input_type = "discrete"
|
|
72
|
+
name: str = "GroupByThenFreq"
|
|
73
|
+
is_grouping: bool = True
|
|
74
|
+
output_type: Optional[str] = "float"
|
|
75
|
+
is_distribution_dependent: bool = True
|
|
76
|
+
input_type: Optional[str] = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
upgini/autofe/operand.py
CHANGED
|
@@ -8,25 +8,28 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str] = None
|
|
12
12
|
is_unary: bool = False
|
|
13
|
+
is_symmetrical: bool = False
|
|
13
14
|
has_symmetry_importance: bool = False
|
|
14
|
-
input_type: Optional[str]
|
|
15
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str] = None
|
|
16
|
+
output_type: Optional[str] = None
|
|
16
17
|
is_categorical: bool = False
|
|
17
18
|
is_vectorizable: bool = False
|
|
18
19
|
is_grouping: bool = False
|
|
19
20
|
is_binary: bool = False
|
|
20
21
|
is_vector: bool = False
|
|
21
22
|
is_distribution_dependent: bool = False
|
|
22
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]] = None
|
|
23
24
|
|
|
24
25
|
def set_params(self, params: Dict[str, str]):
|
|
25
26
|
self.params = params
|
|
26
27
|
return self
|
|
27
28
|
|
|
28
|
-
def get_params(self) -> Dict[str, str]:
|
|
29
|
-
|
|
29
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
30
|
+
res = {"alias": self.alias}
|
|
31
|
+
res.update(self.params or {})
|
|
32
|
+
return res
|
|
30
33
|
|
|
31
34
|
|
|
32
35
|
MAIN_COLUMN = "main_column"
|
upgini/autofe/unary.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
1
2
|
import numpy as np
|
|
2
3
|
import pandas as pd
|
|
3
4
|
|
|
@@ -5,24 +6,26 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class Abs(PandasOperand, VectorizableMixin):
|
|
8
|
-
name = "abs"
|
|
9
|
-
is_unary = True
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
group_index = 0
|
|
9
|
+
name: str = "abs"
|
|
10
|
+
is_unary: bool = True
|
|
11
|
+
is_vectorizable: bool = True
|
|
12
|
+
group_index: int = 0
|
|
12
13
|
|
|
13
14
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
14
|
-
return data.abs()
|
|
15
|
+
return data.astype(np.float64).abs()
|
|
16
|
+
# return data.abs()
|
|
15
17
|
|
|
16
18
|
def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
|
|
17
|
-
return data.abs()
|
|
19
|
+
return data.astype(np.float64).abs()
|
|
20
|
+
# return data.abs()
|
|
18
21
|
|
|
19
22
|
|
|
20
23
|
class Log(PandasOperand, VectorizableMixin):
|
|
21
|
-
name = "log"
|
|
22
|
-
is_unary = True
|
|
23
|
-
is_vectorizable = True
|
|
24
|
-
output_type = "float"
|
|
25
|
-
group_index = 0
|
|
24
|
+
name: str = "log"
|
|
25
|
+
is_unary: bool = True
|
|
26
|
+
is_vectorizable: bool = True
|
|
27
|
+
output_type: Optional[str] = "float"
|
|
28
|
+
group_index: int = 0
|
|
26
29
|
|
|
27
30
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
28
31
|
return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
|
|
@@ -32,11 +35,11 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
32
35
|
|
|
33
36
|
|
|
34
37
|
class Sqrt(PandasOperand, VectorizableMixin):
|
|
35
|
-
name = "sqrt"
|
|
36
|
-
is_unary = True
|
|
37
|
-
is_vectorizable = True
|
|
38
|
-
output_type = "float"
|
|
39
|
-
group_index = 0
|
|
38
|
+
name: str = "sqrt"
|
|
39
|
+
is_unary: bool = True
|
|
40
|
+
is_vectorizable: bool = True
|
|
41
|
+
output_type: Optional[str] = "float"
|
|
42
|
+
group_index: int = 0
|
|
40
43
|
|
|
41
44
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
42
45
|
return self._round_value(np.sqrt(np.abs(data)))
|
|
@@ -46,10 +49,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
46
49
|
|
|
47
50
|
|
|
48
51
|
class Square(PandasOperand, VectorizableMixin):
|
|
49
|
-
name = "square"
|
|
50
|
-
is_unary = True
|
|
51
|
-
is_vectorizable = True
|
|
52
|
-
group_index = 0
|
|
52
|
+
name: str = "square"
|
|
53
|
+
is_unary: bool = True
|
|
54
|
+
is_vectorizable: bool = True
|
|
55
|
+
group_index: int = 0
|
|
53
56
|
|
|
54
57
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
55
58
|
return np.square(data)
|
|
@@ -59,11 +62,11 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
59
62
|
|
|
60
63
|
|
|
61
64
|
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
62
|
-
name = "sigmoid"
|
|
63
|
-
is_unary = True
|
|
64
|
-
is_vectorizable = True
|
|
65
|
-
output_type = "float"
|
|
66
|
-
group_index = 0
|
|
65
|
+
name: str = "sigmoid"
|
|
66
|
+
is_unary: bool = True
|
|
67
|
+
is_vectorizable: bool = True
|
|
68
|
+
output_type: Optional[str] = "float"
|
|
69
|
+
group_index: int = 0
|
|
67
70
|
|
|
68
71
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
69
72
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
@@ -73,12 +76,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
73
76
|
|
|
74
77
|
|
|
75
78
|
class Floor(PandasOperand, VectorizableMixin):
|
|
76
|
-
name = "floor"
|
|
77
|
-
is_unary = True
|
|
78
|
-
is_vectorizable = True
|
|
79
|
-
output_type = "int"
|
|
80
|
-
input_type = "continuous"
|
|
81
|
-
group_index = 0
|
|
79
|
+
name: str = "floor"
|
|
80
|
+
is_unary: bool = True
|
|
81
|
+
is_vectorizable: bool = True
|
|
82
|
+
output_type: Optional[str] = "int"
|
|
83
|
+
input_type: Optional[str] = "continuous"
|
|
84
|
+
group_index: int = 0
|
|
82
85
|
|
|
83
86
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
84
87
|
return np.floor(data)
|
|
@@ -88,11 +91,11 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
88
91
|
|
|
89
92
|
|
|
90
93
|
class Residual(PandasOperand, VectorizableMixin):
|
|
91
|
-
name = "residual"
|
|
92
|
-
is_unary = True
|
|
93
|
-
is_vectorizable = True
|
|
94
|
-
input_type = "continuous"
|
|
95
|
-
group_index = 0
|
|
94
|
+
name: str = "residual"
|
|
95
|
+
is_unary: bool = True
|
|
96
|
+
is_vectorizable: bool = True
|
|
97
|
+
input_type: Optional[str] = "continuous"
|
|
98
|
+
group_index: int = 0
|
|
96
99
|
|
|
97
100
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
98
101
|
return data - np.floor(data)
|
|
@@ -102,30 +105,51 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
102
105
|
|
|
103
106
|
|
|
104
107
|
class Freq(PandasOperand):
|
|
105
|
-
name = "freq"
|
|
106
|
-
is_unary = True
|
|
107
|
-
output_type = "float"
|
|
108
|
-
is_distribution_dependent = True
|
|
109
|
-
input_type = "discrete"
|
|
108
|
+
name: str = "freq"
|
|
109
|
+
is_unary: bool = True
|
|
110
|
+
output_type: Optional[str] = "float"
|
|
111
|
+
is_distribution_dependent: bool = True
|
|
112
|
+
input_type: Optional[str] = "discrete"
|
|
110
113
|
|
|
111
114
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
112
115
|
value_counts = data.value_counts(normalize=True)
|
|
113
116
|
return self._loc(data, value_counts)
|
|
114
117
|
|
|
115
118
|
|
|
116
|
-
class
|
|
117
|
-
name = "
|
|
118
|
-
is_unary = True
|
|
119
|
-
output_type = "
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
zero_bound_low: int
|
|
123
|
-
zero_bound_high: int
|
|
124
|
-
step: int
|
|
119
|
+
class Norm(PandasOperand):
|
|
120
|
+
name: str = "norm"
|
|
121
|
+
is_unary: bool = True
|
|
122
|
+
output_type: Optional[str] = "float"
|
|
123
|
+
norm: Optional[float] = None
|
|
125
124
|
|
|
126
125
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
126
|
+
data_dropna = data.dropna()
|
|
127
|
+
if data_dropna.empty:
|
|
128
|
+
return data
|
|
129
|
+
|
|
130
|
+
if self.norm is not None:
|
|
131
|
+
normalized_data = data / self.norm
|
|
132
|
+
else:
|
|
133
|
+
self.norm = np.sqrt(np.sum(data * data))
|
|
134
|
+
normalized_data = data / self.norm
|
|
135
|
+
|
|
136
|
+
return normalized_data
|
|
137
|
+
|
|
138
|
+
def set_params(self, params: Dict[str, str]):
|
|
139
|
+
super().set_params(params)
|
|
140
|
+
if params is not None and "norm" in params:
|
|
141
|
+
self.norm = float(params["norm"])
|
|
142
|
+
return self
|
|
143
|
+
|
|
144
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
145
|
+
res = super().get_params()
|
|
146
|
+
if self.norm is not None:
|
|
147
|
+
res["norm"] = self.norm
|
|
131
148
|
return res
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class Embeddings(PandasOperand):
|
|
152
|
+
name: str = "emb"
|
|
153
|
+
is_unary: bool = True
|
|
154
|
+
input_type: Optional[str] = "string"
|
|
155
|
+
output_type: Optional[str] = "vector"
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "mean"
|
|
10
|
-
output_type = "float"
|
|
11
|
-
is_vector = True
|
|
12
|
-
group_index = 0
|
|
9
|
+
name: str = "mean"
|
|
10
|
+
output_type: Optional[str] = "float"
|
|
11
|
+
is_vector: bool = True
|
|
12
|
+
group_index: int = 0
|
|
13
13
|
|
|
14
14
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
15
15
|
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class Sum(PandasOperand, VectorizableMixin):
|
|
19
|
-
name = "sum"
|
|
20
|
-
is_vector = True
|
|
21
|
-
group_index = 0
|
|
19
|
+
name: str = "sum"
|
|
20
|
+
is_vector: bool = True
|
|
21
|
+
group_index: int = 0
|
|
22
22
|
|
|
23
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|