upgini 1.1.309a1__tar.gz → 1.1.309a3511.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/PKG-INFO +3 -1
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/pyproject.toml +3 -0
- upgini-1.1.309a3511.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/all_operands.py +33 -7
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/binary.py +93 -2
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/date.py +16 -3
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/feature.py +24 -11
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/unary.py +7 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/dataset.py +385 -30
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/features_enricher.py +120 -276
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/metadata.py +1 -16
- upgini-1.1.309a1/src/upgini/utils/phone_utils.py → upgini-1.1.309a3511.dev1/src/upgini/normalizer/phone_normalizer.py +25 -41
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/country_utils.py +0 -16
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/datetime_utils.py +15 -34
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/email_utils.py +5 -19
- upgini-1.1.309a3511.dev1/src/upgini/utils/ip_utils.py +53 -0
- upgini-1.1.309a3511.dev1/src/upgini/utils/phone_utils.py +11 -0
- upgini-1.1.309a3511.dev1/src/upgini/utils/postal_code_utils.py +11 -0
- upgini-1.1.309a1/src/upgini/__about__.py +0 -1
- upgini-1.1.309a1/src/upgini/normalizer/normalize_utils.py +0 -203
- upgini-1.1.309a1/src/upgini/utils/ip_utils.py +0 -152
- upgini-1.1.309a1/src/upgini/utils/postal_code_utils.py +0 -45
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/.gitignore +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/LICENSE +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/README.md +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/http.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/metrics.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.309a1 → upgini-1.1.309a3511.dev1}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.309a1
|
|
3
|
+
Version: 1.1.309a3511.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
+
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
+
Requires-Dist: levenshtein>=0.25.1
|
|
29
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
30
32
|
Requires-Dist: numpy>=1.19.0
|
|
31
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.309a3511.dev1"
|
|
@@ -1,17 +1,38 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
3
|
+
from upgini.autofe.binary import (
|
|
4
|
+
Add,
|
|
5
|
+
Combine,
|
|
6
|
+
CombineThenFreq,
|
|
7
|
+
Distance,
|
|
8
|
+
Divide,
|
|
9
|
+
JaroWinklerSim1,
|
|
10
|
+
JaroWinklerSim2,
|
|
11
|
+
LevenshteinSim,
|
|
12
|
+
Max,
|
|
13
|
+
Min,
|
|
14
|
+
Multiply,
|
|
15
|
+
Sim,
|
|
16
|
+
Subtract,
|
|
17
|
+
)
|
|
4
18
|
from upgini.autofe.date import (
|
|
19
|
+
(
|
|
5
20
|
DateDiff,
|
|
21
|
+
|
|
6
22
|
DateDiffType2,
|
|
23
|
+
|
|
7
24
|
DateListDiff,
|
|
25
|
+
|
|
8
26
|
DateListDiffBounded,
|
|
9
27
|
DatePercentile,
|
|
28
|
+
|
|
29
|
+
DatePercentileMethod2,
|
|
30
|
+
),
|
|
10
31
|
DatePercentileMethod2,
|
|
11
32
|
)
|
|
12
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
33
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
13
34
|
from upgini.autofe.operand import Operand
|
|
14
|
-
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
35
|
+
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
15
36
|
from upgini.autofe.vector import Mean, Sum
|
|
16
37
|
|
|
17
38
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -39,10 +60,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
39
60
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
40
61
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
41
62
|
GroupByThenRank(),
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
63
|
+
Combine(),
|
|
64
|
+
CombineThenFreq(),
|
|
65
|
+
GroupByThenNUnique(),
|
|
66
|
+
GroupByThenFreq(),
|
|
46
67
|
Sim(),
|
|
47
68
|
DateDiff(),
|
|
48
69
|
DateDiffType2(),
|
|
@@ -59,6 +80,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
59
80
|
DatePercentile(),
|
|
60
81
|
DatePercentileMethod2(),
|
|
61
82
|
Norm(),
|
|
83
|
+
JaroWinklerSim1(),
|
|
84
|
+
JaroWinklerSim2(),
|
|
85
|
+
LevenshteinSim(),
|
|
86
|
+
Distance(),
|
|
87
|
+
Embeddings(),
|
|
62
88
|
]
|
|
63
89
|
}
|
|
64
90
|
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
6
|
from numpy import dot
|
|
4
7
|
from numpy.linalg import norm
|
|
8
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
9
|
|
|
6
10
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
11
|
|
|
@@ -130,7 +134,27 @@ class CombineThenFreq(PandasOperand):
|
|
|
130
134
|
self._loc(temp, value_counts)
|
|
131
135
|
|
|
132
136
|
|
|
133
|
-
class Sim(PandasOperand):
|
|
137
|
+
class Distance(PandasOperand):
|
|
138
|
+
name = "dist"
|
|
139
|
+
is_binary = True
|
|
140
|
+
output_type = "float"
|
|
141
|
+
is_symmetrical = True
|
|
142
|
+
has_symmetry_importance = True
|
|
143
|
+
|
|
144
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
145
|
+
return pd.Series(
|
|
146
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# row-wise dot product
|
|
150
|
+
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
151
|
+
res = (left.dropna() * right.dropna()).apply(np.sum)
|
|
152
|
+
res = res.reindex(left.index.union(right.index))
|
|
153
|
+
return res
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Left for backward compatibility
|
|
157
|
+
class Sim(Distance):
|
|
134
158
|
name = "sim"
|
|
135
159
|
is_binary = True
|
|
136
160
|
output_type = "float"
|
|
@@ -138,4 +162,71 @@ class Sim(PandasOperand):
|
|
|
138
162
|
has_symmetry_importance = True
|
|
139
163
|
|
|
140
164
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
141
|
-
return
|
|
165
|
+
return 1 - super().calculate_binary(left, right)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class StringSim(PandasOperand, abc.ABC):
|
|
169
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
170
|
+
sims = []
|
|
171
|
+
for i in left.index:
|
|
172
|
+
left_i = self._prepare_value(left.get(i))
|
|
173
|
+
right_i = self._prepare_value(right.get(i))
|
|
174
|
+
if left_i is not None and right_i is not None:
|
|
175
|
+
sims.append(self._similarity(left_i, right_i))
|
|
176
|
+
else:
|
|
177
|
+
sims.append(None)
|
|
178
|
+
|
|
179
|
+
return pd.Series(sims, index=left.index)
|
|
180
|
+
|
|
181
|
+
@abc.abstractmethod
|
|
182
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
183
|
+
pass
|
|
184
|
+
|
|
185
|
+
@abc.abstractmethod
|
|
186
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class JaroWinklerSim1(StringSim):
|
|
191
|
+
name = "sim_jw1"
|
|
192
|
+
is_binary = True
|
|
193
|
+
input_type = "string"
|
|
194
|
+
output_type = "float"
|
|
195
|
+
is_symmetrical = True
|
|
196
|
+
has_symmetry_importance = True
|
|
197
|
+
|
|
198
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
|
+
return value
|
|
200
|
+
|
|
201
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
202
|
+
return jarowinkler_similarity(left, right)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class JaroWinklerSim2(StringSim):
|
|
206
|
+
name = "sim_jw2"
|
|
207
|
+
is_binary = True
|
|
208
|
+
input_type = "string"
|
|
209
|
+
output_type = "float"
|
|
210
|
+
is_symmetrical = True
|
|
211
|
+
has_symmetry_importance = True
|
|
212
|
+
|
|
213
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
|
+
return value[::-1] if value is not None else None
|
|
215
|
+
|
|
216
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
217
|
+
return jarowinkler_similarity(left, right)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class LevenshteinSim(StringSim):
|
|
221
|
+
name = "sim_lv"
|
|
222
|
+
is_binary = True
|
|
223
|
+
input_type = "string"
|
|
224
|
+
output_type = "float"
|
|
225
|
+
is_symmetrical = True
|
|
226
|
+
has_symmetry_importance = True
|
|
227
|
+
|
|
228
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
|
+
return value
|
|
230
|
+
|
|
231
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
232
|
+
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
|
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
+
replace_negative: bool = False
|
|
47
|
+
|
|
46
48
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
47
49
|
res = super().get_params()
|
|
48
50
|
res.update(
|
|
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
50
52
|
"diff_unit": self.diff_unit,
|
|
51
53
|
"left_unit": self.left_unit,
|
|
52
54
|
"right_unit": self.right_unit,
|
|
55
|
+
"replace_negative": self.replace_negative,
|
|
53
56
|
}
|
|
54
57
|
)
|
|
55
58
|
return res
|
|
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
61
64
|
return self.__replace_negative(diff)
|
|
62
65
|
|
|
63
66
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
64
|
-
|
|
67
|
+
if self.replace_negative:
|
|
68
|
+
x[x < 0] = None
|
|
65
69
|
return x
|
|
66
70
|
|
|
67
71
|
|
|
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
101
105
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
102
106
|
is_binary = True
|
|
103
107
|
has_symmetry_importance = True
|
|
108
|
+
|
|
104
109
|
aggregation: str
|
|
110
|
+
replace_negative: bool = False
|
|
105
111
|
|
|
106
112
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
107
113
|
res = super().get_params()
|
|
108
114
|
res.update(
|
|
109
115
|
{
|
|
110
116
|
"aggregation": self.aggregation,
|
|
117
|
+
"diff_unit": self.diff_unit,
|
|
118
|
+
"left_unit": self.left_unit,
|
|
119
|
+
"right_unit": self.right_unit,
|
|
120
|
+
"replace_negative": self.replace_negative,
|
|
111
121
|
}
|
|
112
122
|
)
|
|
113
123
|
return res
|
|
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
125
135
|
|
|
126
136
|
def _diff(self, x: TimedeltaArray):
|
|
127
137
|
x = self._convert_diff_to_unit(x)
|
|
128
|
-
return x[x > 0]
|
|
138
|
+
return x[x > 0] if self.replace_negative else x
|
|
129
139
|
|
|
130
140
|
def _agg(self, x):
|
|
131
141
|
method = getattr(np, self.aggregation, None)
|
|
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
|
|
|
157
167
|
super().__init__(**data)
|
|
158
168
|
|
|
159
169
|
def _agg(self, x):
|
|
160
|
-
x = x[
|
|
170
|
+
x = x[
|
|
171
|
+
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
172
|
+
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
173
|
+
]
|
|
161
174
|
return super()._agg(x)
|
|
162
175
|
|
|
163
176
|
|
|
@@ -138,15 +138,17 @@ class Feature:
|
|
|
138
138
|
if self.cached_display_name is not None and cache:
|
|
139
139
|
return self.cached_display_name
|
|
140
140
|
|
|
141
|
+
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
142
|
+
prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
|
|
143
|
+
|
|
141
144
|
if self.alias:
|
|
142
145
|
components = ["f_autofe", self.alias]
|
|
143
|
-
elif shorten and not self.op.is_unary:
|
|
144
|
-
components = ["f_autofe"
|
|
146
|
+
elif shorten and (not self.op.is_unary or should_stack_op):
|
|
147
|
+
components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
|
|
145
148
|
else:
|
|
146
|
-
components =
|
|
147
|
-
"autofe"
|
|
148
|
-
|
|
149
|
-
]
|
|
149
|
+
components = (
|
|
150
|
+
["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
|
|
151
|
+
)
|
|
150
152
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
151
153
|
display_name = "_".join(components)
|
|
152
154
|
|
|
@@ -237,12 +239,18 @@ class Feature:
|
|
|
237
239
|
|
|
238
240
|
@staticmethod
|
|
239
241
|
def from_formula(string: str) -> Union[Column, "Feature"]:
|
|
240
|
-
if string[-1] != ")":
|
|
241
|
-
return Column(string)
|
|
242
242
|
|
|
243
243
|
def is_trivial_char(c: str) -> bool:
|
|
244
244
|
return c not in "()+-*/,"
|
|
245
245
|
|
|
246
|
+
if string[-1] != ")":
|
|
247
|
+
if all(is_trivial_char(c) for c in string):
|
|
248
|
+
return Column(string)
|
|
249
|
+
else:
|
|
250
|
+
raise ValueError(
|
|
251
|
+
f"Unsupported column name: {string}. Column names should not have characters: ['(', ')', '+', '-', '*', '/', ',']"
|
|
252
|
+
)
|
|
253
|
+
|
|
246
254
|
def find_prev(string: str) -> int:
|
|
247
255
|
if string[-1] != ")":
|
|
248
256
|
return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
|
|
@@ -264,8 +272,11 @@ class Feature:
|
|
|
264
272
|
return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
|
|
265
273
|
p1 = find_prev(string[: p2 - 1])
|
|
266
274
|
if string[0] == "(":
|
|
275
|
+
op = find_op(string[p2 - 1])
|
|
276
|
+
if op is None:
|
|
277
|
+
raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
|
|
267
278
|
return Feature(
|
|
268
|
-
|
|
279
|
+
op,
|
|
269
280
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
270
281
|
)
|
|
271
282
|
else:
|
|
@@ -276,6 +287,8 @@ class Feature:
|
|
|
276
287
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
277
288
|
)
|
|
278
289
|
else:
|
|
290
|
+
if string[p1 - 1] == "(":
|
|
291
|
+
raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
|
|
279
292
|
base_features = [
|
|
280
293
|
Feature.from_formula(string[p2:-1]),
|
|
281
294
|
Feature.from_formula(string[p1 : p2 - 1]),
|
|
@@ -321,10 +334,10 @@ class FeatureGroup:
|
|
|
321
334
|
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
322
335
|
if any(isinstance(f, Feature) for f in lower_order_children):
|
|
323
336
|
child_data = pd.concat(
|
|
324
|
-
[data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
|
|
337
|
+
[data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
|
|
325
338
|
axis=1,
|
|
326
339
|
)
|
|
327
|
-
child_data.columns = [main_column] + lower_order_names
|
|
340
|
+
child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
|
|
328
341
|
else:
|
|
329
342
|
child_data = data[columns]
|
|
330
343
|
|
|
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Embeddings(PandasOperand):
|
|
131
|
+
name = "emb"
|
|
132
|
+
is_unary = True
|
|
133
|
+
input_type = "string"
|
|
134
|
+
output_type = "vector"
|