upgini 1.1.231a2__tar.gz → 1.1.232a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.231a2/src/upgini.egg-info → upgini-1.1.232a2}/PKG-INFO +1 -1
- {upgini-1.1.231a2 → upgini-1.1.232a2}/setup.py +1 -1
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/feature.py +134 -111
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/features_enricher.py +3 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.231a2 → upgini-1.1.232a2}/LICENSE +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/README.md +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/pyproject.toml +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/setup.cfg +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/ads.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/dataset.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/errors.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/http.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/metadata.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/metrics.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/search_task.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/spinner.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_country_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_email_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_metrics.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.231a2 → upgini-1.1.232a2}/tests/test_widget.py +0 -0
|
@@ -1,78 +1,61 @@
|
|
|
1
1
|
import hashlib
|
|
2
|
-
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
4
|
+
|
|
3
5
|
import numpy as np
|
|
4
6
|
import pandas as pd
|
|
5
|
-
import
|
|
6
|
-
from upgini.autofe.operand import PandasOperand
|
|
7
|
-
from upgini.autofe.all_operands import (
|
|
8
|
-
find_op,
|
|
9
|
-
)
|
|
7
|
+
from pandas._typing import DtypeObj
|
|
10
8
|
|
|
9
|
+
from upgini.autofe.all_operands import find_op
|
|
10
|
+
from upgini.autofe.operand import Operand, PandasOperand
|
|
11
11
|
|
|
12
|
-
class FeatureGroup(object):
|
|
13
|
-
def __init__(self, op, main_column, children):
|
|
14
|
-
self.op = op
|
|
15
|
-
self.main_column_node = main_column
|
|
16
|
-
self.children = children
|
|
17
|
-
self.data = None
|
|
18
|
-
|
|
19
|
-
def get_columns(self, **kwargs):
|
|
20
|
-
column_list = []
|
|
21
|
-
seen = set()
|
|
22
|
-
for child in self.children:
|
|
23
|
-
columns = child.get_columns(**kwargs)
|
|
24
|
-
column_list.extend([f for f in columns if f not in seen])
|
|
25
|
-
seen.update(columns)
|
|
26
|
-
return column_list
|
|
27
12
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
13
|
+
class Column:
|
|
14
|
+
def __init__(self, name: str, data: Optional[pd.Series] = None, calculate_all=False):
|
|
15
|
+
self.name = name
|
|
16
|
+
self.data = data
|
|
17
|
+
self.calculate_all = calculate_all
|
|
31
18
|
|
|
32
|
-
def
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
columns = self.get_columns()
|
|
36
|
-
new_data = self.op.calculate_group(data[columns], main_column=main_column)
|
|
37
|
-
new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
|
|
19
|
+
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
20
|
+
self.name = self._unhash(mapping.get(self.name) or self.name)
|
|
21
|
+
return self
|
|
38
22
|
|
|
23
|
+
def _unhash(self, feature_name: str) -> str:
|
|
24
|
+
last_component_idx = feature_name.rfind("_")
|
|
25
|
+
if not feature_name.startswith("f_"):
|
|
26
|
+
return feature_name # etalon feature
|
|
27
|
+
elif last_component_idx == 1:
|
|
28
|
+
return feature_name[2:] # fully hashed name, cannot unhash
|
|
39
29
|
else:
|
|
40
|
-
|
|
30
|
+
return feature_name[2:last_component_idx]
|
|
41
31
|
|
|
42
|
-
|
|
32
|
+
def delete_data(self):
|
|
33
|
+
self.data = None
|
|
43
34
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
return new_data
|
|
35
|
+
def get_column_nodes(self) -> List["Column"]:
|
|
36
|
+
return [self]
|
|
47
37
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
grouped_features = []
|
|
51
|
-
for op_child, features in itertools.groupby(
|
|
52
|
-
candidates, lambda f: (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
|
|
53
|
-
):
|
|
54
|
-
op, main_child = op_child
|
|
55
|
-
feature_list = list(features)
|
|
56
|
-
if op.is_vectorizable:
|
|
57
|
-
if op.is_unary:
|
|
58
|
-
group = FeatureGroup(op, main_column=None, children=feature_list)
|
|
59
|
-
else:
|
|
60
|
-
group = FeatureGroup(op, main_column=main_child, children=feature_list)
|
|
61
|
-
grouped_features.append(group)
|
|
62
|
-
else:
|
|
63
|
-
grouped_features.extend(feature_list)
|
|
64
|
-
return grouped_features
|
|
38
|
+
def get_columns(self) -> List[str]:
|
|
39
|
+
return [self.name]
|
|
65
40
|
|
|
66
|
-
def
|
|
67
|
-
self.
|
|
68
|
-
if self.main_column_node:
|
|
69
|
-
self.main_column_node.delete_data()
|
|
70
|
-
for child in self.children:
|
|
71
|
-
child.delete_data()
|
|
41
|
+
def infer_type(self, data: pd.DataFrame) -> DtypeObj:
|
|
42
|
+
return data[self.name].dtype
|
|
72
43
|
|
|
44
|
+
def calculate(self, data: pd.DataFrame) -> pd.Series:
|
|
45
|
+
self.data = data[self.name]
|
|
46
|
+
return self.data
|
|
47
|
+
|
|
48
|
+
def to_formula(self, **kwargs) -> str:
|
|
49
|
+
return str(self.get_columns(**kwargs)[0])
|
|
73
50
|
|
|
74
|
-
|
|
75
|
-
|
|
51
|
+
def to_pretty_formula(self) -> str:
|
|
52
|
+
return self.to_formula()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Feature:
|
|
56
|
+
def __init__(self, op: Operand, children: List[Union[Column, "Feature"]], data: Optional[pd.DataFrame] = None,
|
|
57
|
+
display_index: Optional[str] = None, cached_display_name: Optional[str] = None,
|
|
58
|
+
alias: Optional[str] = None):
|
|
76
59
|
self.op = op
|
|
77
60
|
self.children = children
|
|
78
61
|
self.data = data
|
|
@@ -80,32 +63,32 @@ class Feature(object):
|
|
|
80
63
|
self.cached_display_name = cached_display_name
|
|
81
64
|
self.alias = alias
|
|
82
65
|
|
|
83
|
-
def set_op_params(self, params: Dict):
|
|
66
|
+
def set_op_params(self, params: Dict[str, str]) -> "Feature":
|
|
84
67
|
self.op.set_params(params)
|
|
85
68
|
return self
|
|
86
69
|
|
|
87
|
-
def get_hash(self):
|
|
70
|
+
def get_hash(self) -> str:
|
|
88
71
|
return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
|
|
89
72
|
:8
|
|
90
73
|
]
|
|
91
74
|
|
|
92
|
-
def set_alias(self, alias):
|
|
75
|
+
def set_alias(self, alias: str) -> "Feature":
|
|
93
76
|
self.alias = alias
|
|
94
77
|
return self
|
|
95
78
|
|
|
96
|
-
def rename_columns(self, mapping: Dict):
|
|
79
|
+
def rename_columns(self, mapping: Dict[str, str]) -> "Feature":
|
|
97
80
|
for child in self.children:
|
|
98
81
|
child.rename_columns(mapping)
|
|
99
82
|
self.cached_display_name = None
|
|
100
83
|
return self
|
|
101
84
|
|
|
102
|
-
def get_column_nodes(self):
|
|
85
|
+
def get_column_nodes(self) -> List[Union[Column, "Feature"]]:
|
|
103
86
|
res = []
|
|
104
87
|
for child in self.children:
|
|
105
88
|
res.extend(child.get_column_nodes())
|
|
106
89
|
return res
|
|
107
90
|
|
|
108
|
-
def get_columns(self, **kwargs):
|
|
91
|
+
def get_columns(self, **kwargs) -> List[str]:
|
|
109
92
|
column_list = []
|
|
110
93
|
seen = set()
|
|
111
94
|
for child in self.children:
|
|
@@ -119,7 +102,7 @@ class Feature(object):
|
|
|
119
102
|
for child in self.children:
|
|
120
103
|
child.delete_data()
|
|
121
104
|
|
|
122
|
-
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs):
|
|
105
|
+
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
|
123
106
|
if self.cached_display_name is not None and cache:
|
|
124
107
|
return self.cached_display_name
|
|
125
108
|
|
|
@@ -139,27 +122,27 @@ class Feature(object):
|
|
|
139
122
|
self.cached_display_name = display_name
|
|
140
123
|
return display_name
|
|
141
124
|
|
|
142
|
-
def set_display_index(self, index):
|
|
125
|
+
def set_display_index(self, index) -> "Feature":
|
|
143
126
|
self.display_index = index
|
|
144
127
|
self.cached_display_name = None
|
|
145
128
|
return self
|
|
146
129
|
|
|
147
|
-
def infer_type(self, data):
|
|
130
|
+
def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
|
|
148
131
|
if self.op.output_type:
|
|
149
132
|
return self.op.output_type
|
|
150
133
|
else:
|
|
151
134
|
# either a symmetrical operator or group by
|
|
152
135
|
return self.children[0].infer_type(data)
|
|
153
136
|
|
|
154
|
-
def calculate(self, data, is_root=False):
|
|
137
|
+
def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
|
|
155
138
|
if isinstance(self.op, PandasOperand) and self.op.is_vector:
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
139
|
+
if self.op.is_vector:
|
|
140
|
+
ds = [child.calculate(data) for child in self.children]
|
|
141
|
+
new_data = self.op.calculate(data=ds)
|
|
142
|
+
else:
|
|
143
|
+
d1 = self.children[0].calculate(data)
|
|
144
|
+
d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
|
|
145
|
+
new_data = self.op.calculate(data=d1, left=d1, right=d2)
|
|
163
146
|
else:
|
|
164
147
|
raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
|
|
165
148
|
|
|
@@ -173,8 +156,8 @@ class Feature(object):
|
|
|
173
156
|
return new_data
|
|
174
157
|
|
|
175
158
|
@staticmethod
|
|
176
|
-
def check_xor(left, right):
|
|
177
|
-
def _get_all_columns(feature):
|
|
159
|
+
def check_xor(left: Union[Column, "Feature"], right: Union[Column, "Feature"]) -> bool:
|
|
160
|
+
def _get_all_columns(feature: Union[Column, "Feature"]) -> List[str]:
|
|
178
161
|
if isinstance(feature, Column):
|
|
179
162
|
return [feature.name]
|
|
180
163
|
else:
|
|
@@ -190,7 +173,7 @@ class Feature(object):
|
|
|
190
173
|
else:
|
|
191
174
|
return True
|
|
192
175
|
|
|
193
|
-
def to_formula(self, **kwargs):
|
|
176
|
+
def to_formula(self, **kwargs) -> str:
|
|
194
177
|
if self.op.name in ["+", "-", "*", "/"]:
|
|
195
178
|
left = self.children[0].to_formula(**kwargs)
|
|
196
179
|
right = self.children[1].to_formula(**kwargs)
|
|
@@ -205,15 +188,30 @@ class Feature(object):
|
|
|
205
188
|
result.append(")")
|
|
206
189
|
return "".join(result)
|
|
207
190
|
|
|
191
|
+
def to_pretty_formula(self) -> str:
|
|
192
|
+
if self.op.name in ["+", "-", "*", "/"]:
|
|
193
|
+
left = self.children[0].to_pretty_formula()
|
|
194
|
+
right = self.children[1].to_pretty_formula()
|
|
195
|
+
return f"{left} {self.op.name} {right}"
|
|
196
|
+
else:
|
|
197
|
+
result = [self.op.name, "("]
|
|
198
|
+
for i in range(len(self.children)):
|
|
199
|
+
string_i = self.children[i].to_pretty_formula()
|
|
200
|
+
result.append(string_i)
|
|
201
|
+
result.append(", ")
|
|
202
|
+
result.pop()
|
|
203
|
+
result.append(")")
|
|
204
|
+
return "".join(result)
|
|
205
|
+
|
|
208
206
|
@staticmethod
|
|
209
|
-
def from_formula(string):
|
|
207
|
+
def from_formula(string: str) -> Union[Column, "Feature"]:
|
|
210
208
|
if string[-1] != ")":
|
|
211
209
|
return Column(string)
|
|
212
210
|
|
|
213
|
-
def is_trivial_char(c):
|
|
211
|
+
def is_trivial_char(c: str) -> bool:
|
|
214
212
|
return not (c in "()+-*/,")
|
|
215
213
|
|
|
216
|
-
def find_prev(string):
|
|
214
|
+
def find_prev(string: str) -> int:
|
|
217
215
|
if string[-1] != ")":
|
|
218
216
|
return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
|
|
219
217
|
level, pos = 0, -1
|
|
@@ -259,40 +257,65 @@ class Feature(object):
|
|
|
259
257
|
return Feature(op, base_features)
|
|
260
258
|
|
|
261
259
|
|
|
262
|
-
class
|
|
263
|
-
def __init__(self,
|
|
264
|
-
|
|
265
|
-
self.
|
|
266
|
-
self.
|
|
260
|
+
class FeatureGroup:
|
|
261
|
+
def __init__(self, op: Operand, main_column: Optional[Union[Column, Feature]],
|
|
262
|
+
children: List[Union[Column, Feature]]):
|
|
263
|
+
self.op = op
|
|
264
|
+
self.main_column_node = main_column
|
|
265
|
+
self.children = children
|
|
266
|
+
self.data: Optional[pd.DataFrame] = None
|
|
267
267
|
|
|
268
|
-
def
|
|
269
|
-
|
|
270
|
-
|
|
268
|
+
def get_columns(self, **kwargs) -> List[str]:
|
|
269
|
+
column_list = []
|
|
270
|
+
seen = set()
|
|
271
|
+
for child in self.children:
|
|
272
|
+
columns = child.get_columns(**kwargs)
|
|
273
|
+
column_list.extend([f for f in columns if f not in seen])
|
|
274
|
+
seen.update(columns)
|
|
275
|
+
return column_list
|
|
271
276
|
|
|
272
|
-
def
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
277
|
+
def get_display_names(self, **kwargs) -> List[str]:
|
|
278
|
+
names = [f.get_display_name(**kwargs) for f in self.children]
|
|
279
|
+
return names
|
|
280
|
+
|
|
281
|
+
def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
|
|
282
|
+
main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
|
|
283
|
+
if isinstance(self.op, PandasOperand):
|
|
284
|
+
columns = self.get_columns()
|
|
285
|
+
new_data = self.op.calculate_group(data[columns], main_column=main_column)
|
|
286
|
+
new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
|
|
278
287
|
else:
|
|
279
|
-
|
|
288
|
+
raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
|
|
280
289
|
|
|
281
|
-
|
|
282
|
-
self.data = None
|
|
290
|
+
new_data.replace([-np.inf, np.inf], np.nan, inplace=True)
|
|
283
291
|
|
|
284
|
-
|
|
285
|
-
|
|
292
|
+
if is_root:
|
|
293
|
+
self.data = new_data
|
|
294
|
+
return new_data
|
|
286
295
|
|
|
287
|
-
|
|
288
|
-
|
|
296
|
+
@staticmethod
|
|
297
|
+
def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
|
|
298
|
+
grouped_features = []
|
|
289
299
|
|
|
290
|
-
|
|
291
|
-
|
|
300
|
+
def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
|
|
301
|
+
return (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
|
|
292
302
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
303
|
+
for op_child, features in itertools.groupby(candidates, groupby_func):
|
|
304
|
+
op, main_child = op_child
|
|
305
|
+
feature_list = list(features)
|
|
306
|
+
if op.is_vectorizable:
|
|
307
|
+
if op.is_unary:
|
|
308
|
+
group = FeatureGroup(op, main_column=None, children=feature_list)
|
|
309
|
+
else:
|
|
310
|
+
group = FeatureGroup(op, main_column=main_child, children=feature_list)
|
|
311
|
+
grouped_features.append(group)
|
|
312
|
+
else:
|
|
313
|
+
grouped_features.extend(feature_list)
|
|
314
|
+
return grouped_features
|
|
296
315
|
|
|
297
|
-
def
|
|
298
|
-
|
|
316
|
+
def delete_data(self):
|
|
317
|
+
self.data = None
|
|
318
|
+
if self.main_column_node:
|
|
319
|
+
self.main_column_node.delete_data()
|
|
320
|
+
for child in self.children:
|
|
321
|
+
child.delete_data()
|
|
@@ -2921,6 +2921,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2921
2921
|
if feature_meta is None:
|
|
2922
2922
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
|
2923
2923
|
continue
|
|
2924
|
+
description["shap"] = feature_meta.shap_value
|
|
2924
2925
|
description["Sources"] = feature_meta.data_source\
|
|
2925
2926
|
.replace("AutoFE: features from ", "")\
|
|
2926
2927
|
.replace("AutoFE: feature from ", "")
|
|
@@ -2940,6 +2941,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2940
2941
|
|
|
2941
2942
|
descriptions_df = pd.DataFrame(descriptions)
|
|
2942
2943
|
descriptions_df.fillna("", inplace=True)
|
|
2944
|
+
descriptions_df.sort_values(by="shap", ascending=False, inplace=True)
|
|
2945
|
+
descriptions_df.drop(columns="shap", inplace=True)
|
|
2943
2946
|
return descriptions_df
|
|
2944
2947
|
except Exception:
|
|
2945
2948
|
self.logger.exception("Failed to generate AutoFE features description")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|