upgini 1.1.228__py3-none-any.whl → 1.1.231__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their public registries and is provided for informational purposes only.
Potentially problematic release: this version of upgini might be problematic.
- upgini/autofe/__init__.py +0 -0
- upgini/autofe/all_operands.py +43 -0
- upgini/autofe/binary.py +133 -0
- upgini/autofe/feature.py +298 -0
- upgini/autofe/groupby.py +82 -0
- upgini/autofe/operand.py +70 -0
- upgini/autofe/unary.py +105 -0
- upgini/autofe/vector.py +20 -0
- upgini/features_enricher.py +20 -11
- upgini/metadata.py +3 -2
- upgini/utils/cv_utils.py +1 -1
- {upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/METADATA +1 -1
- {upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/RECORD +16 -8
- {upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/LICENSE +0 -0
- {upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/WHEEL +0 -0
- {upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/top_level.txt +0 -0

upgini/autofe/__init__.py
File without changes

upgini/autofe/all_operands.py
ADDED
@@ -0,0 +1,43 @@
+from typing import Dict
+from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
+from upgini.autofe.operand import Operand
+from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
+from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
+from upgini.autofe.vector import Mean, Sum
+
+ALL_OPERANDS: Dict[str, Operand] = {
+    op.name: op
+    for op in [
+        Freq(),
+        Mean(),
+        Sum(),
+        Abs(),
+        Log(),
+        Sqrt(),
+        Square(),
+        Sigmoid(),
+        Floor(),
+        Residual(),
+        Min(),
+        Max(),
+        Add(),
+        Subtract(),
+        Multiply(),
+        Divide(),
+        GroupByThenAgg(name="GroupByThenMin", agg="min"),
+        GroupByThenAgg(name="GroupByThenMax", agg="max"),
+        GroupByThenAgg(name="GroupByThenMean", agg="mean"),
+        GroupByThenAgg(name="GroupByThenMedian", agg="median"),
+        GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
+        GroupByThenRank(),
+        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
+        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
+        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
+        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
+        Sim(),
+    ]
+}
+
+
+def find_op(name):
+    return ALL_OPERANDS.get(name)

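Note: ALL_OPERANDS keys the registry by each operand's "name" field, so find_op is a plain dict lookup that returns None for unknown names. A minimal usage sketch (illustrative, not part of the diff):

from upgini.autofe.all_operands import find_op

op = find_op("log")     # the Log() instance registered above
print(op.output_type)   # "float"
print(find_op("nope"))  # None
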
upgini/autofe/binary.py
ADDED
@@ -0,0 +1,133 @@
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
+import numpy as np
+import pandas as pd
+from numpy import dot
+from numpy.linalg import norm
+
+
+class Min(PandasOperand):
+    name = "min"
+    is_binary = True
+    has_symmetry_importance = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return np.minimum(left, right)
+
+
+class Max(PandasOperand):
+    name = "max"
+    is_binary = True
+    has_symmetry_importance = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return np.maximum(left, right)
+
+
+class Add(PandasOperand, VectorizableMixin):
+    name = "+"
+    alias = "add"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return left + right
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+
+        return d1.add(d2, axis=0)
+
+
+class Subtract(PandasOperand, VectorizableMixin):
+    name = "-"
+    alias = "sub"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return left - right
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+
+        return d1.sub(d2, axis=0)
+
+
+class Multiply(PandasOperand, VectorizableMixin):
+    name = "*"
+    alias = "mul"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return left * right
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+
+        return d1.mul(d2, axis=0)
+
+
+class Divide(PandasOperand, VectorizableMixin):
+    name = "/"
+    alias = "div"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    output_type = "float"
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return left / right.replace(0, np.nan)
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+
+        return d1.div(d2.replace(0, np.nan), axis=0)
+
+
+class Combine(PandasOperand):
+    name = "Combine"
+    is_binary = True
+    has_symmetry_importance = True
+    output_type = "object"
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        temp = left.astype(str) + "_" + right.astype(str)
+        temp[left.isna() | right.isna()] = np.nan
+        return pd.Series(temp, index=left.index)
+
+
+class CombineThenFreq(PandasOperand):
+    name = "CombineThenFreq"
+    is_binary = True
+    has_symmetry_importance = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        temp = left.astype(str) + "_" + right.astype(str)
+        temp[left.isna() | right.isna()] = np.nan
+        value_counts = temp.value_counts(normalize=True)
+        return self._loc(temp, value_counts)
+
+
+class Sim(PandasOperand):
+    name = "sim"
+    is_binary = True
+    output_type = "float"
+    has_symmetry_importance = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return dot(left, right) / (norm(left) * norm(right))

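Note: Divide masks zeros in the denominator with NaN before dividing, so division by zero produces NaN rather than inf. A small sketch of that behavior (illustrative, not part of the diff):

import pandas as pd
from upgini.autofe.binary import Divide

left = pd.Series([10.0, 4.0, 1.0])
right = pd.Series([2.0, 0.0, 5.0])
print(Divide().calculate_binary(left, right).tolist())  # [5.0, nan, 0.2]
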
upgini/autofe/feature.py
ADDED
@@ -0,0 +1,298 @@
+import hashlib
+from typing import Dict
+import numpy as np
+import pandas as pd
+import itertools
+from upgini.autofe.operand import PandasOperand
+from upgini.autofe.all_operands import (
+    find_op,
+)
+
+
+class FeatureGroup(object):
+    def __init__(self, op, main_column, children):
+        self.op = op
+        self.main_column_node = main_column
+        self.children = children
+        self.data = None
+
+    def get_columns(self, **kwargs):
+        column_list = []
+        seen = set()
+        for child in self.children:
+            columns = child.get_columns(**kwargs)
+            column_list.extend([f for f in columns if f not in seen])
+            seen.update(columns)
+        return column_list
+
+    def get_display_names(self, **kwargs):
+        names = [f.get_display_name(**kwargs) for f in self.children]
+        return names
+
+    def calculate(self, data: pd.DataFrame, is_root=False):
+        main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
+        if isinstance(self.op, PandasOperand):
+            columns = self.get_columns()
+            new_data = self.op.calculate_group(data[columns], main_column=main_column)
+            new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
+
+        else:
+            raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
+
+        new_data.replace([-np.inf, np.inf], np.nan, inplace=True)
+
+        if is_root:
+            self.data = new_data
+        return new_data
+
+    @staticmethod
+    def make_groups(candidates):
+        grouped_features = []
+        for op_child, features in itertools.groupby(
+            candidates, lambda f: (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
+        ):
+            op, main_child = op_child
+            feature_list = list(features)
+            if op.is_vectorizable:
+                if op.is_unary:
+                    group = FeatureGroup(op, main_column=None, children=feature_list)
+                else:
+                    group = FeatureGroup(op, main_column=main_child, children=feature_list)
+                grouped_features.append(group)
+            else:
+                grouped_features.extend(feature_list)
+        return grouped_features
+
+    def delete_data(self):
+        self.data = None
+        if self.main_column_node:
+            self.main_column_node.delete_data()
+        for child in self.children:
+            child.delete_data()
+
+
+class Feature(object):
+    def __init__(self, op, children, data=None, display_index=None, cached_display_name=None, alias=None):
+        self.op = op
+        self.children = children
+        self.data = data
+        self.display_index = display_index
+        self.cached_display_name = cached_display_name
+        self.alias = alias
+
+    def set_op_params(self, params: Dict):
+        self.op.set_params(params)
+        return self
+
+    def get_hash(self):
+        return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
+            :8
+        ]
+
+    def set_alias(self, alias):
+        self.alias = alias
+        return self
+
+    def rename_columns(self, mapping: Dict):
+        for child in self.children:
+            child.rename_columns(mapping)
+        self.cached_display_name = None
+        return self
+
+    def get_column_nodes(self):
+        res = []
+        for child in self.children:
+            res.extend(child.get_column_nodes())
+        return res
+
+    def get_columns(self, **kwargs):
+        column_list = []
+        seen = set()
+        for child in self.children:
+            columns = child.get_columns(**kwargs)
+            column_list.extend([f for f in columns if f not in seen])
+            seen.update(columns)
+        return column_list
+
+    def delete_data(self):
+        self.data = None
+        for child in self.children:
+            child.delete_data()
+
+    def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs):
+        if self.cached_display_name is not None and cache:
+            return self.cached_display_name
+
+        if self.alias:
+            components = ["f_autofe", self.alias]
+        elif shorten and not self.op.is_unary:
+            components = ["f_autofe", self.op.alias or self.op.name.lower()]
+        else:
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
+                "autofe",
+                self.op.alias or self.op.name.lower(),
+            ]
+        components.extend([str(self.display_index)] if self.display_index is not None else [])
+        display_name = "_".join(components)
+
+        if cache:
+            self.cached_display_name = display_name
+        return display_name
+
+    def set_display_index(self, index):
+        self.display_index = index
+        self.cached_display_name = None
+        return self
+
+    def infer_type(self, data):
+        if self.op.output_type:
+            return self.op.output_type
+        else:
+            # either a symmetrical operator or group by
+            return self.children[0].infer_type(data)
+
+    def calculate(self, data, is_root=False):
+        if isinstance(self.op, PandasOperand) and self.op.is_vector:
+            ds = [child.calculate(data) for child in self.children]
+            new_data = self.op.calculate(data=ds)
+
+        elif isinstance(self.op, PandasOperand):
+            d1 = self.children[0].calculate(data)
+            d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
+            new_data = self.op.calculate(data=d1, left=d1, right=d2)
+        else:
+            raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
+
+        if (str(new_data.dtype) == "category") | (str(new_data.dtype) == "object"):
+            pass
+        else:
+            new_data = new_data.replace([-np.inf, np.inf], np.nan)
+
+        if is_root:
+            self.data = new_data
+        return new_data
+
+    @staticmethod
+    def check_xor(left, right):
+        def _get_all_columns(feature):
+            if isinstance(feature, Column):
+                return [feature.name]
+            else:
+                res = []
+                for child in feature.children:
+                    res.extend(_get_all_columns(child))
+                return res
+
+        column1 = set(_get_all_columns(left))
+        column2 = set(_get_all_columns(right))
+        if len(column1 ^ column2) == 0:
+            return False
+        else:
+            return True
+
+    def to_formula(self, **kwargs):
+        if self.op.name in ["+", "-", "*", "/"]:
+            left = self.children[0].to_formula(**kwargs)
+            right = self.children[1].to_formula(**kwargs)
+            return f"({left}{self.op.name}{right})"
+        else:
+            result = [self.op.name, "("]
+            for i in range(len(self.children)):
+                string_i = self.children[i].to_formula(**kwargs)
+                result.append(string_i)
+                result.append(",")
+            result.pop()
+            result.append(")")
+            return "".join(result)
+
+    @staticmethod
+    def from_formula(string):
+        if string[-1] != ")":
+            return Column(string)
+
+        def is_trivial_char(c):
+            return not (c in "()+-*/,")
+
+        def find_prev(string):
+            if string[-1] != ")":
+                return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
+            level, pos = 0, -1
+            for i in range(len(string) - 1, -1, -1):
+                if string[i] == ")":
+                    level += 1
+                if string[i] == "(":
+                    level -= 1
+                if level == 0:
+                    pos = i
+                    break
+            while (pos > 0) and is_trivial_char(string[pos - 1]):
+                pos -= 1
+            return pos
+
+        p2 = find_prev(string[:-1])
+        if string[p2 - 1] == "(":
+            return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
+        p1 = find_prev(string[: p2 - 1])
+        if string[0] == "(":
+            return Feature(
+                find_op(string[p2 - 1]),
+                [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
+            )
+        else:
+            op = find_op(string[: p1 - 1])
+            if op is not None:
+                return Feature(
+                    op,
+                    [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
+                )
+            else:
+                base_features = [
+                    Feature.from_formula(string[p2:-1]),
+                    Feature.from_formula(string[p1 : p2 - 1]),
+                ]
+                while op is None:
+                    p2 = p1
+                    p1 = find_prev(string[: p1 - 1])
+                    base_features.append(Feature.from_formula(string[p1 : p2 - 1]))
+                    op = find_op(string[: p1 - 1])
+                base_features.reverse()
+                return Feature(op, base_features)
+
+
+class Column(object):
+    def __init__(self, name, data=None, calculate_all=False):
+        self.name = name
+        self.data = data
+        self.calculate_all = calculate_all
+
+    def rename_columns(self, mapping: Dict):
+        self.name = self._unhash(mapping.get(self.name) or self.name)
+        return self
+
+    def _unhash(self, feature_name):
+        last_component_idx = feature_name.rfind("_")
+        if not feature_name.startswith("f_"):
+            return feature_name  # etalon feature
+        elif last_component_idx == 1:
+            return feature_name[2:]  # fully hashed name, cannot unhash
+        else:
+            return feature_name[2:last_component_idx]
+
+    def delete_data(self):
+        self.data = None
+
+    def get_column_nodes(self):
+        return [self]
+
+    def get_columns(self):
+        return [self.name]
+
+    def infer_type(self, data):
+        return data[self.name].dtype
+
+    def calculate(self, data):
+        self.data = data[self.name]
+        return self.data
+
+    def to_formula(self, **kwargs):
+        return str(self.get_columns(**kwargs)[0])

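Note: to_formula and from_formula are inverse serializations of the operand tree: arithmetic operands print infix with parentheses, everything else prints as op(arg1,arg2,...), and from_formula walks the string right to left, matching parentheses and resolving operand names via find_op. A round-trip sketch (illustrative, not part of the diff):

from upgini.autofe.feature import Feature

f = Feature.from_formula("(f1/f2)")
print(f.op.name)                     # "/"
print([c.name for c in f.children])  # ["f1", "f2"]
print(f.to_formula())                # "(f1/f2)"
print(f.get_display_name())          # "f_f1_f_f2_autofe_div"
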
upgini/autofe/groupby.py
ADDED
@@ -0,0 +1,82 @@
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from typing import Optional
+import pandas as pd
+
+
+class GroupByThenAgg(PandasOperand, VectorizableMixin):
+    agg: Optional[str]
+    is_vectorizable = True
+    is_grouping = True
+    is_distribution_dependent = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        temp = left.groupby(right).agg(self.agg)
+        return self._loc(right, temp)
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+        temp = d1.groupby(d2).agg(self.agg)
+        return temp.merge(d2, how="right", on=[group_column])[value_columns]
+
+
+class GroupByThenMedian(GroupByThenAgg):
+    name = "GroupByThenMedian"
+    pandas_agg = "median"
+    is_distribution_dependent = True
+
+
+class GroupByThenRank(PandasOperand, VectorizableMixin):
+    name = "GroupByThenRank"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
+        return temp.merge(pd.DataFrame(right).reset_index(), how="right", on=["index"])[left.name]
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+        temp = d1[~d2.isna()].groupby(d2).rank(ascending=True, pct=True)[value_columns].reset_index()
+        return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
+
+
+class GroupByThenNUnique(PandasOperand, VectorizableMixin):
+    name = "GroupByThenNUnique"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "int"
+    is_distribution_dependent = True
+    input_type = "discrete"
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        nunique = left.groupby(right).nunique()
+        return self._loc(right, nunique)
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
+        d1 = data[value_columns]
+        d2 = data[group_column]
+        nunique = d1.groupby(d2).nunique()
+        return nunique.merge(d2, how="right", on=[group_column])[value_columns]
+
+
+class GroupByThenFreq(PandasOperand):
+    name = "GroupByThenFreq"
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        def _f(x):
+            value_counts = x.value_counts(normalize=True)
+            return self._loc(x, value_counts)
+
+        freq = left.groupby(right).apply(_f)
+        return pd.Series(freq, index=right.index)

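Note: the grouping operands compute an aggregate per group and broadcast it back onto the original rows (via _loc on the binary path, or a right merge on the vectorized path). A sketch with GroupByThenAgg (illustrative, not part of the diff):

import pandas as pd
from upgini.autofe.groupby import GroupByThenAgg

op = GroupByThenAgg(name="GroupByThenMean", agg="mean")
values = pd.Series([1.0, 2.0, 3.0, 5.0])
groups = pd.Series(["a", "a", "b", "b"])
print(op.calculate_binary(values, groups).tolist())  # [1.5, 1.5, 4.0, 4.0]
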
upgini/autofe/operand.py
ADDED
@@ -0,0 +1,70 @@
+from pydantic import BaseModel
+from typing import Dict, List, Optional, Tuple
+import abc
+import pandas as pd
+import numpy as np
+
+
+class Operand(BaseModel):
+    name: str
+    alias: Optional[str]
+    is_unary: bool = False
+    has_symmetry_importance: bool = False
+    input_type: Optional[str]
+    output_type: Optional[str]
+    is_categorical: bool = False
+    is_vectorizable: bool = False
+    is_grouping: bool = False
+    is_binary: bool = False
+    is_vector: bool = False
+    is_distribution_dependent: bool = False
+    params: Optional[Dict[str, str]]
+
+    def set_params(self, params: Dict[str, str]):
+        self.params = params
+        return self
+
+    def get_params(self) -> Dict[str, str]:
+        return self.params
+
+
+MAIN_COLUMN = "main_column"
+
+
+class PandasOperand(Operand, abc.ABC):
+    def calculate(self, **kwargs) -> pd.Series:
+        if self.is_unary:
+            return self.calculate_unary(kwargs["data"])
+        elif self.is_binary or self.is_grouping:
+            return self.calculate_binary(kwargs["left"], kwargs["right"])
+        else:
+            return self.calculate_vector(kwargs["data"])
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        pass
+
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        pass
+
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        pass
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        if not self.is_vectorizable:
+            raise RuntimeError(f"Cannot apply calculate_group: operator {self.name} is not vectorizable")
+        else:
+            raise RuntimeError(f"Unimplemented calculate_group for operator {self.name}")
+
+    def _loc(self, df_to, df_from):
+        df_from.loc[np.nan] = np.nan
+        return df_to.fillna(np.nan).apply(lambda x: df_from.loc[x])
+
+
+class VectorizableMixin(Operand):
+    def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
+        if not kwargs.get(MAIN_COLUMN):
+            raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
+        group_column = kwargs[MAIN_COLUMN]
+        value_columns = [col for col in input_columns if col != group_column]
+
+        return group_column, value_columns

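Note: _loc is the broadcast helper the operand modules rely on. It adds a NaN -> NaN row to the lookup table so missing or unseen keys resolve to NaN, then maps each input value through the table. Roughly equivalent standalone code (illustrative, not part of the diff):

import numpy as np
import pandas as pd

lookup = pd.Series({"a": 0.5, "b": 0.25})
keys = pd.Series(["a", np.nan, "b"])
lookup.loc[np.nan] = np.nan  # missing keys resolve to NaN
print(keys.fillna(np.nan).apply(lambda x: lookup.loc[x]).tolist())  # [0.5, nan, 0.25]
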
upgini/autofe/unary.py
ADDED
@@ -0,0 +1,105 @@
+from upgini.autofe.operand import PandasOperand
+import numpy as np
+import pandas as pd
+
+
+class Abs(PandasOperand):
+    name = "abs"
+    is_unary = True
+    is_vectorizable = True
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return data.abs()
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return data.abs()
+
+
+class Log(PandasOperand):
+    name = "log"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return np.log(np.abs(data.replace(0, np.nan)))
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return np.log(data.replace(0, np.nan).abs())
+
+
+class Sqrt(PandasOperand):
+    name = "sqrt"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return np.sqrt(np.abs(data))
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return np.sqrt(data.abs())
+
+
+class Square(PandasOperand):
+    name = "square"
+    is_unary = True
+    is_vectorizable = True
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return np.square(data)
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return np.square(data)
+
+
+class Sigmoid(PandasOperand):
+    name = "sigmoid"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return 1 / (1 + np.exp(-data))
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return 1 / (1 + np.exp(-data))
+
+
+class Floor(PandasOperand):
+    name = "floor"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "int"
+    input_type = "continuous"
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return np.floor(data)
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return np.floor(data)
+
+
+class Residual(PandasOperand):
+    name = "residual"
+    is_unary = True
+    is_vectorizable = True
+    input_type = "continuous"
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return data - np.floor(data)
+
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        return data - np.floor(data)
+
+
+class Freq(PandasOperand):
+    name = "freq"
+    is_unary = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
+
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        value_counts = data.value_counts(normalize=True)
+        return self._loc(data, value_counts)

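Note: the unary operands guard their domains rather than raising: Log drops zeros to NaN and takes the absolute value first, Sqrt likewise works on the absolute value, and Freq replaces each value with its relative frequency. A sketch of Freq (illustrative, not part of the diff):

import pandas as pd
from upgini.autofe.unary import Freq

s = pd.Series(["x", "x", "y", "x"])
print(Freq().calculate_unary(s).tolist())  # [0.75, 0.75, 0.25, 0.75]
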
upgini/autofe/vector.py
ADDED
@@ -0,0 +1,20 @@
+from typing import List
+import pandas as pd
+from upgini.autofe.operand import PandasOperand
+
+
+class Mean(PandasOperand):
+    name = "mean"
+    output_type = "float"
+    is_vector = True
+
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        return pd.DataFrame(data).T.fillna(0).mean(axis=1)
+
+
+class Sum(PandasOperand):
+    name = "sum"
+    is_vector = True
+
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        return pd.DataFrame(data).T.fillna(0).sum(axis=1)

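Note: the vector operands take a list of aligned series, stack them column-wise, and fill NaN with 0 before aggregating, so a missing value contributes zero to the sum and pulls the mean toward zero. A sketch (illustrative, not part of the diff):

import numpy as np
import pandas as pd
from upgini.autofe.vector import Mean

cols = [pd.Series([1.0, np.nan]), pd.Series([3.0, 4.0])]
print(Mean().calculate_vector(cols).tolist())  # [2.0, 2.0]
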
upgini/features_enricher.py
CHANGED
@@ -5,7 +5,6 @@ import logging
 import numbers
 import os
 import pickle
-import re
 import sys
 import tempfile
 import time
@@ -22,6 +21,7 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
+from upgini.autofe.feature import Feature
 
 from upgini.data_source.data_source_publisher import CommercialSchema
 from upgini.dataset import Dataset
@@ -251,6 +251,7 @@ class FeaturesEnricher(TransformerMixin):
             raise e
 
         self.runtime_parameters = runtime_parameters or RuntimeParameters()
+        self.runtime_parameters.properties["feature_generation_params.hash_index"] = True
         self.date_format = date_format
         self.random_state = random_state
         self.detect_missing_search_keys = detect_missing_search_keys
@@ -904,6 +905,9 @@ class FeaturesEnricher(TransformerMixin):
 
         model_task_type = self.model_task_type or define_task(y_sorted, self.logger, silent=True)
         _cv = cv or self.cv
+        if groups is None and _cv == CVType.group_k_fold:
+            self.logger.info("Replacing group_k_fold with k_fold as no groups were found")
+            _cv = CVType.k_fold
         if not isinstance(_cv, BaseCrossValidator):
            date_column = self._get_date_column(search_keys)
            date_series = validated_X[date_column] if date_column is not None else None
@@ -1629,9 +1633,9 @@ class FeaturesEnricher(TransformerMixin):
                 c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
             ]
             features_section = (
-                ', "features": {'
-                ", ".join([f'"{feature}": "test_value"' for feature in original_features_for_transform])
-                "}"
+                ', "features": {'
+                + ", ".join([f'"{feature}": "test_value"' for feature in original_features_for_transform])
+                + "}"
             )
         else:
             features_section = ""
@@ -2269,7 +2273,7 @@ class FeaturesEnricher(TransformerMixin):
             msg = bundle.get("multivariate_timeseries_detected")
             self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
         elif (
-
+            self.cv is None
             and model_task_type != ModelTaskType.REGRESSION
             and self._get_group_columns(self.fit_search_keys)
         ):
@@ -2907,13 +2911,20 @@ class FeaturesEnricher(TransformerMixin):
 
         descriptions = []
         for m in autofe_meta:
+            autofe_feature = Feature.from_formula(m.formula)
+            if autofe_feature.op.is_vector:
+                continue
+
             description = dict()
 
             feature_meta = get_feature_by_display_index(m.display_index)
             if feature_meta is None:
                 self.logger.warning(f"Feature meta for display index {m.display_index} not found")
                 continue
-            description["
+            description["shap"] = feature_meta.shap_value
+            description["Sources"] = feature_meta.data_source\
+                .replace("AutoFE: features from ", "")\
+                .replace("AutoFE: feature from ", "")
             description["Feature name"] = feature_meta.name
 
             feature_idx = 1
@@ -2921,11 +2932,7 @@ class FeaturesEnricher(TransformerMixin):
                 description[f"Feature {feature_idx}"] = bc.hashed_name
                 feature_idx += 1
 
-
-            if match is None:
-                self.logger.warning(f"Failed to infer autofe function from name {feature_meta.name}")
-            else:
-                description["Function"] = match.group(1)
+            description["Function"] = autofe_feature.op.name
 
             descriptions.append(description)
@@ -2934,6 +2941,8 @@ class FeaturesEnricher(TransformerMixin):
 
             descriptions_df = pd.DataFrame(descriptions)
             descriptions_df.fillna("", inplace=True)
+            descriptions_df.sort_values(by="shap", ascending=False, inplace=True)
+            descriptions_df.drop(columns="shap", inplace=True)
             return descriptions_df
         except Exception:
             self.logger.exception("Failed to generate AutoFE features description")

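Note on the features_section hunk: in Python, adjacent string literals merge at parse time, but a literal does not merge with the result of a call, so without explicit "+" operators the first literal fuses into the join separator instead of prefixing the join result; the added "+" operators fix the grouping. A standalone demonstration of the pitfall (illustrative, not part of the diff):

# Without "+", the first literal becomes part of the join separator:
sep = ', "features": {' ", "   # one merged literal: ', "features": {, '
print(sep.join(['"f1": "test_value"', '"f2": "test_value"']))
# -> '"f1": "test_value", "features": {, "f2": "test_value"'
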
upgini/metadata.py
CHANGED
@@ -68,7 +68,7 @@ class SearchKey(Enum):
     @staticmethod
     def personal_keys() -> List["SearchKey"]:
         return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
-
+
     @staticmethod
     def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
         if meaning_type == FileColumnMeaningType.EMAIL:
@@ -247,11 +247,12 @@ class BaseColumnMetadata(BaseModel):
     original_name: str
     hashed_name: str
     ads_definition_id: Optional[str]
+    is_augmented: bool
 
 
 class GeneratedFeatureMetadata(BaseModel):
     formula: str  # on hashed names
-    display_index:
+    display_index: str
     base_columns: List[BaseColumnMetadata]
 
 

upgini/utils/cv_utils.py
CHANGED
@@ -22,7 +22,7 @@ class CVConfig:
         elif isinstance(cv_type, CVType):
             self.cv_type = cv_type
         else:
-            raise Exception(f"
+            raise Exception(f"Unexpected type of cv_type: {type(cv_type)}")
 
         self.shuffle_kfold: Optional[bool] = shuffle_kfold
         self.test_size = 0.2

{upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/RECORD
CHANGED
@@ -2,16 +2,24 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
 upgini/dataset.py,sha256=7z9zbVvd1_MiufmoZlCwEHwQ25Q2DX_0g9PFcSMlqMY,49764
 upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=2B9rk_8QNMV7o1khbgZX8A1T6vJqyfki4F4UAYoR0po,158857
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=HzUSZudCdISJGUqHC1gAT1v_x1n_dIFVDJW4z3Q7DCs,41204
-upgini/metadata.py,sha256=
+upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
 upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
 upgini/search_task.py,sha256=7YxH1zrUHMmePO0VbPBBCJjeoer7jAC0Gltc9EVAOIg,17126
 upgini/spinner.py,sha256=yhakBaydMNS8E8TRAwTdCMdnWrHeWT0cR1M8c9hP6jA,1157
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=O6Pcl_y5e_ULfQ-xmGGn_qBP4z7EtV7TP9etjrsLkLE,2647
+upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s,1487
+upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
+upgini/autofe/feature.py,sha256=cElNcLfw9BeBVUkkaFzWWXrnyWNUCXiw0FGqsitorbE,10133
+upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
+upgini/autofe/operand.py,sha256=8WqEoSIA5rEWCK1xuC303E4NW5a72GZ5jUMAEj4skII,2291
+upgini/autofe/unary.py,sha256=7TBe7PCt7l_XQEqu_G5g_TC2cW3tppL7uPDcX8xsqz0,2731
+upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=zFu0WMKwPM11gPZHq8dpsBP7s4wmTtBqYoDEakgNxoY,13725
 upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
@@ -30,7 +38,7 @@ upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCS
 upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
 upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o,6436
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
-upgini/utils/cv_utils.py,sha256=
+upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
 upgini/utils/datetime_utils.py,sha256=P56e7gcgAogJYfs2Blzk1uypxb9yrFzNaeJpMCRm6Zc,7716
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=MhCLUAWqbp81xRyKizauNhVx6t_MFeJQRQ8pFM7EpFo,3480
@@ -45,8 +53,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
 upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
 upgini/utils/track_info.py,sha256=DVNVZmXUb4f25DSPEuUNEFx49hNEBfmuY9iSW5jkMnI,5708
 upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
+upgini-1.1.231.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.231.dist-info/METADATA,sha256=itfTK0u6dBS8WjaITVm9TcsXC2mgQ_W0LRBqB_Nialk,48398
+upgini-1.1.231.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+upgini-1.1.231.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.231.dist-info/RECORD,,

{upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/LICENSE
File without changes

{upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/WHEEL
File without changes

{upgini-1.1.228.dist-info → upgini-1.1.231.dist-info}/top_level.txt
File without changes