upgini 1.1.312a5__tar.gz → 1.1.313__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.312a5 → upgini-1.1.313}/PKG-INFO +3 -1
- {upgini-1.1.312a5 → upgini-1.1.313}/pyproject.toml +2 -0
- upgini-1.1.313/src/upgini/__about__.py +1 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/all_operands.py +26 -7
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/binary.py +95 -4
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/date.py +26 -6
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/feature.py +25 -11
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/unary.py +7 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/dataset.py +386 -33
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/features_enricher.py +145 -295
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/metadata.py +1 -16
- upgini-1.1.312a5/src/upgini/utils/phone_utils.py → upgini-1.1.313/src/upgini/normalizer/phone_normalizer.py +27 -43
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/country_utils.py +0 -16
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/datetime_utils.py +16 -38
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/email_utils.py +17 -49
- upgini-1.1.313/src/upgini/utils/ip_utils.py +53 -0
- upgini-1.1.313/src/upgini/utils/phone_utils.py +11 -0
- upgini-1.1.313/src/upgini/utils/postal_code_utils.py +11 -0
- upgini-1.1.312a5/src/upgini/__about__.py +0 -1
- upgini-1.1.312a5/src/upgini/normalizer/normalize_utils.py +0 -203
- upgini-1.1.312a5/src/upgini/utils/ip_utils.py +0 -152
- upgini-1.1.312a5/src/upgini/utils/postal_code_utils.py +0 -45
- {upgini-1.1.312a5 → upgini-1.1.313}/.gitignore +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/LICENSE +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/README.md +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/ads.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/errors.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/http.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/lazy_import.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/metrics.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/search_task.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/spinner.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.312a5 → upgini-1.1.313}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.312a5
|
|
3
|
+
Version: 1.1.313
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
+
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
+
Requires-Dist: levenshtein>=0.25.1
|
|
29
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
30
32
|
Requires-Dist: numpy>=1.19.0
|
|
31
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.313"
|
|
@@ -1,6 +1,20 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
3
|
+
from upgini.autofe.binary import (
|
|
4
|
+
Add,
|
|
5
|
+
Combine,
|
|
6
|
+
CombineThenFreq,
|
|
7
|
+
Distance,
|
|
8
|
+
Divide,
|
|
9
|
+
JaroWinklerSim1,
|
|
10
|
+
JaroWinklerSim2,
|
|
11
|
+
LevenshteinSim,
|
|
12
|
+
Max,
|
|
13
|
+
Min,
|
|
14
|
+
Multiply,
|
|
15
|
+
Sim,
|
|
16
|
+
Subtract,
|
|
17
|
+
)
|
|
4
18
|
from upgini.autofe.date import (
|
|
5
19
|
DateDiff,
|
|
6
20
|
DateDiffType2,
|
|
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
|
|
|
9
23
|
DatePercentile,
|
|
10
24
|
DatePercentileMethod2,
|
|
11
25
|
)
|
|
12
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
26
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
13
27
|
from upgini.autofe.operand import Operand
|
|
14
|
-
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
28
|
+
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
15
29
|
from upgini.autofe.vector import Mean, Sum
|
|
16
30
|
|
|
17
31
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
39
53
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
40
54
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
41
55
|
GroupByThenRank(),
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
56
|
+
Combine(),
|
|
57
|
+
CombineThenFreq(),
|
|
58
|
+
GroupByThenNUnique(),
|
|
59
|
+
GroupByThenFreq(),
|
|
46
60
|
Sim(),
|
|
47
61
|
DateDiff(),
|
|
48
62
|
DateDiffType2(),
|
|
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
59
73
|
DatePercentile(),
|
|
60
74
|
DatePercentileMethod2(),
|
|
61
75
|
Norm(),
|
|
76
|
+
JaroWinklerSim1(),
|
|
77
|
+
JaroWinklerSim2(),
|
|
78
|
+
LevenshteinSim(),
|
|
79
|
+
Distance(),
|
|
80
|
+
Embeddings(),
|
|
62
81
|
]
|
|
63
82
|
}
|
|
64
83
|
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
|
-
from
|
|
4
|
-
from numpy.linalg import norm
|
|
6
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
7
|
|
|
6
8
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
9
|
|
|
@@ -130,7 +132,29 @@ class CombineThenFreq(PandasOperand):
|
|
|
130
132
|
self._loc(temp, value_counts)
|
|
131
133
|
|
|
132
134
|
|
|
133
|
-
class Sim(PandasOperand):
|
|
135
|
+
class Distance(PandasOperand):
|
|
136
|
+
name = "dist"
|
|
137
|
+
is_binary = True
|
|
138
|
+
output_type = "float"
|
|
139
|
+
is_symmetrical = True
|
|
140
|
+
has_symmetry_importance = True
|
|
141
|
+
|
|
142
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
|
+
return pd.Series(
|
|
144
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
# row-wise dot product
|
|
148
|
+
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
149
|
+
left = left.apply(lambda x: np.array(x))
|
|
150
|
+
right = right.apply(lambda x: np.array(x))
|
|
151
|
+
res = (left.dropna() * right.dropna()).apply(np.sum)
|
|
152
|
+
res = res.reindex(left.index.union(right.index))
|
|
153
|
+
return res
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Left for backward compatibility
|
|
157
|
+
class Sim(Distance):
|
|
134
158
|
name = "sim"
|
|
135
159
|
is_binary = True
|
|
136
160
|
output_type = "float"
|
|
@@ -138,4 +162,71 @@ class Sim(PandasOperand):
|
|
|
138
162
|
has_symmetry_importance = True
|
|
139
163
|
|
|
140
164
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
141
|
-
return
|
|
165
|
+
return 1 - super().calculate_binary(left, right)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class StringSim(PandasOperand, abc.ABC):
|
|
169
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
170
|
+
sims = []
|
|
171
|
+
for i in left.index:
|
|
172
|
+
left_i = self._prepare_value(left.get(i))
|
|
173
|
+
right_i = self._prepare_value(right.get(i))
|
|
174
|
+
if left_i is not None and right_i is not None:
|
|
175
|
+
sims.append(self._similarity(left_i, right_i))
|
|
176
|
+
else:
|
|
177
|
+
sims.append(None)
|
|
178
|
+
|
|
179
|
+
return pd.Series(sims, index=left.index)
|
|
180
|
+
|
|
181
|
+
@abc.abstractmethod
|
|
182
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
183
|
+
pass
|
|
184
|
+
|
|
185
|
+
@abc.abstractmethod
|
|
186
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class JaroWinklerSim1(StringSim):
|
|
191
|
+
name = "sim_jw1"
|
|
192
|
+
is_binary = True
|
|
193
|
+
input_type = "string"
|
|
194
|
+
output_type = "float"
|
|
195
|
+
is_symmetrical = True
|
|
196
|
+
has_symmetry_importance = True
|
|
197
|
+
|
|
198
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
|
+
return value
|
|
200
|
+
|
|
201
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
202
|
+
return jarowinkler_similarity(left, right)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class JaroWinklerSim2(StringSim):
|
|
206
|
+
name = "sim_jw2"
|
|
207
|
+
is_binary = True
|
|
208
|
+
input_type = "string"
|
|
209
|
+
output_type = "float"
|
|
210
|
+
is_symmetrical = True
|
|
211
|
+
has_symmetry_importance = True
|
|
212
|
+
|
|
213
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
|
+
return value[::-1] if value is not None else None
|
|
215
|
+
|
|
216
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
217
|
+
return jarowinkler_similarity(left, right)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class LevenshteinSim(StringSim):
|
|
221
|
+
name = "sim_lv"
|
|
222
|
+
is_binary = True
|
|
223
|
+
input_type = "string"
|
|
224
|
+
output_type = "float"
|
|
225
|
+
is_symmetrical = True
|
|
226
|
+
has_symmetry_importance = True
|
|
227
|
+
|
|
228
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
|
+
return value
|
|
230
|
+
|
|
231
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
232
|
+
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
|
@@ -20,7 +20,7 @@ class DateDiffMixin(BaseModel):
|
|
|
20
20
|
if isinstance(x, pd.DataFrame):
|
|
21
21
|
return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
|
|
22
22
|
|
|
23
|
-
return pd.to_datetime(x, unit=unit, errors=
|
|
23
|
+
return pd.to_datetime(x, unit=unit, errors="coerce")
|
|
24
24
|
|
|
25
25
|
def _convert_diff_to_unit(self, diff: Union[pd.Series, TimedeltaArray]) -> Union[pd.Series, TimedeltaArray]:
|
|
26
26
|
if self.diff_unit == "D":
|
|
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
+
replace_negative: bool = False
|
|
47
|
+
|
|
46
48
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
47
49
|
res = super().get_params()
|
|
48
50
|
res.update(
|
|
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
50
52
|
"diff_unit": self.diff_unit,
|
|
51
53
|
"left_unit": self.left_unit,
|
|
52
54
|
"right_unit": self.right_unit,
|
|
55
|
+
"replace_negative": self.replace_negative,
|
|
53
56
|
}
|
|
54
57
|
)
|
|
55
58
|
return res
|
|
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
61
64
|
return self.__replace_negative(diff)
|
|
62
65
|
|
|
63
66
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
64
|
-
|
|
67
|
+
if self.replace_negative:
|
|
68
|
+
x[x < 0] = None
|
|
65
69
|
return x
|
|
66
70
|
|
|
67
71
|
|
|
@@ -96,18 +100,25 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
96
100
|
|
|
97
101
|
|
|
98
102
|
_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
|
|
103
|
+
_count_aggregations = ["nunique", "count"]
|
|
99
104
|
|
|
100
105
|
|
|
101
106
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
102
107
|
is_binary = True
|
|
103
108
|
has_symmetry_importance = True
|
|
109
|
+
|
|
104
110
|
aggregation: str
|
|
111
|
+
replace_negative: bool = False
|
|
105
112
|
|
|
106
113
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
107
114
|
res = super().get_params()
|
|
108
115
|
res.update(
|
|
109
116
|
{
|
|
110
117
|
"aggregation": self.aggregation,
|
|
118
|
+
"diff_unit": self.diff_unit,
|
|
119
|
+
"left_unit": self.left_unit,
|
|
120
|
+
"right_unit": self.right_unit,
|
|
121
|
+
"replace_negative": self.replace_negative,
|
|
111
122
|
}
|
|
112
123
|
)
|
|
113
124
|
return res
|
|
@@ -119,13 +130,19 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
119
130
|
|
|
120
131
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
121
132
|
left = self._convert_to_date(left, self.left_unit)
|
|
122
|
-
|
|
133
|
+
right_mask = right.apply(lambda x: len(x) > 0)
|
|
134
|
+
mask = left.notna() & right.notna() & right_mask
|
|
135
|
+
right_masked = right[mask].apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
|
|
136
|
+
res_masked = pd.Series(left[mask] - right_masked.values).apply(lambda x: self._agg(self._diff(x)))
|
|
137
|
+
res = res_masked.reindex(left.index.union(right.index))
|
|
138
|
+
if self.aggregation in _count_aggregations:
|
|
139
|
+
res[~right_mask] = 0.0
|
|
123
140
|
|
|
124
|
-
return
|
|
141
|
+
return res
|
|
125
142
|
|
|
126
143
|
def _diff(self, x: TimedeltaArray):
|
|
127
144
|
x = self._convert_diff_to_unit(x)
|
|
128
|
-
return x[x > 0]
|
|
145
|
+
return x[x > 0] if self.replace_negative else x
|
|
129
146
|
|
|
130
147
|
def _agg(self, x):
|
|
131
148
|
method = getattr(np, self.aggregation, None)
|
|
@@ -157,7 +174,10 @@ class DateListDiffBounded(DateListDiff):
|
|
|
157
174
|
super().__init__(**data)
|
|
158
175
|
|
|
159
176
|
def _agg(self, x):
|
|
160
|
-
x = x[
|
|
177
|
+
x = x[
|
|
178
|
+
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
179
|
+
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
180
|
+
]
|
|
161
181
|
return super()._agg(x)
|
|
162
182
|
|
|
163
183
|
|
|
@@ -138,15 +138,17 @@ class Feature:
|
|
|
138
138
|
if self.cached_display_name is not None and cache:
|
|
139
139
|
return self.cached_display_name
|
|
140
140
|
|
|
141
|
+
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
142
|
+
prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
|
|
143
|
+
|
|
141
144
|
if self.alias:
|
|
142
145
|
components = ["f_autofe", self.alias]
|
|
143
|
-
elif shorten and not self.op.is_unary:
|
|
144
|
-
components = ["f_autofe"
|
|
146
|
+
elif shorten and (not self.op.is_unary or should_stack_op):
|
|
147
|
+
components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
|
|
145
148
|
else:
|
|
146
|
-
components =
|
|
147
|
-
"autofe"
|
|
148
|
-
|
|
149
|
-
]
|
|
149
|
+
components = (
|
|
150
|
+
["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
|
|
151
|
+
)
|
|
150
152
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
151
153
|
display_name = "_".join(components)
|
|
152
154
|
|
|
@@ -237,12 +239,19 @@ class Feature:
|
|
|
237
239
|
|
|
238
240
|
@staticmethod
|
|
239
241
|
def from_formula(string: str) -> Union[Column, "Feature"]:
|
|
240
|
-
if string[-1] != ")":
|
|
241
|
-
return Column(string)
|
|
242
242
|
|
|
243
243
|
def is_trivial_char(c: str) -> bool:
|
|
244
244
|
return c not in "()+-*/,"
|
|
245
245
|
|
|
246
|
+
if string[-1] != ")":
|
|
247
|
+
if all(is_trivial_char(c) for c in string):
|
|
248
|
+
return Column(string)
|
|
249
|
+
else:
|
|
250
|
+
raise ValueError(
|
|
251
|
+
f"Unsupported column name: {string}. Column names should not have characters: "
|
|
252
|
+
"['(', ')', '+', '-', '*', '/', ',']"
|
|
253
|
+
)
|
|
254
|
+
|
|
246
255
|
def find_prev(string: str) -> int:
|
|
247
256
|
if string[-1] != ")":
|
|
248
257
|
return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
|
|
@@ -264,8 +273,11 @@ class Feature:
|
|
|
264
273
|
return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
|
|
265
274
|
p1 = find_prev(string[: p2 - 1])
|
|
266
275
|
if string[0] == "(":
|
|
276
|
+
op = find_op(string[p2 - 1])
|
|
277
|
+
if op is None:
|
|
278
|
+
raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
|
|
267
279
|
return Feature(
|
|
268
|
-
|
|
280
|
+
op,
|
|
269
281
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
270
282
|
)
|
|
271
283
|
else:
|
|
@@ -276,6 +288,8 @@ class Feature:
|
|
|
276
288
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
277
289
|
)
|
|
278
290
|
else:
|
|
291
|
+
if string[p1 - 1] == "(":
|
|
292
|
+
raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
|
|
279
293
|
base_features = [
|
|
280
294
|
Feature.from_formula(string[p2:-1]),
|
|
281
295
|
Feature.from_formula(string[p1 : p2 - 1]),
|
|
@@ -321,10 +335,10 @@ class FeatureGroup:
|
|
|
321
335
|
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
322
336
|
if any(isinstance(f, Feature) for f in lower_order_children):
|
|
323
337
|
child_data = pd.concat(
|
|
324
|
-
[data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
|
|
338
|
+
[data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
|
|
325
339
|
axis=1,
|
|
326
340
|
)
|
|
327
|
-
child_data.columns = [main_column] + lower_order_names
|
|
341
|
+
child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
|
|
328
342
|
else:
|
|
329
343
|
child_data = data[columns]
|
|
330
344
|
|
|
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Embeddings(PandasOperand):
|
|
131
|
+
name = "emb"
|
|
132
|
+
is_unary = True
|
|
133
|
+
input_type = "string"
|
|
134
|
+
output_type = "vector"
|