upgini 1.1.312__py3-none-any.whl → 1.1.312a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +7 -26
- upgini/autofe/binary.py +4 -95
- upgini/autofe/date.py +3 -16
- upgini/autofe/feature.py +11 -25
- upgini/autofe/unary.py +0 -7
- upgini/dataset.py +30 -385
- upgini/features_enricher.py +276 -120
- upgini/metadata.py +16 -1
- upgini/normalizer/normalize_utils.py +203 -0
- upgini/utils/country_utils.py +16 -0
- upgini/utils/datetime_utils.py +34 -15
- upgini/utils/email_utils.py +19 -5
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +345 -0
- upgini/utils/postal_code_utils.py +34 -0
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/METADATA +1 -3
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/RECORD +20 -20
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/WHEEL +1 -1
- upgini/normalizer/phone_normalizer.py +0 -340
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.312"
|
|
1
|
+
__version__ = "1.1.312a2"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,20 +1,6 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import (
|
|
4
|
-
Add,
|
|
5
|
-
Combine,
|
|
6
|
-
CombineThenFreq,
|
|
7
|
-
Distance,
|
|
8
|
-
Divide,
|
|
9
|
-
JaroWinklerSim1,
|
|
10
|
-
JaroWinklerSim2,
|
|
11
|
-
LevenshteinSim,
|
|
12
|
-
Max,
|
|
13
|
-
Min,
|
|
14
|
-
Multiply,
|
|
15
|
-
Sim,
|
|
16
|
-
Subtract,
|
|
17
|
-
)
|
|
3
|
+
from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
|
|
18
4
|
from upgini.autofe.date import (
|
|
19
5
|
DateDiff,
|
|
20
6
|
DateDiffType2,
|
|
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
|
|
|
23
9
|
DatePercentile,
|
|
24
10
|
DatePercentileMethod2,
|
|
25
11
|
)
|
|
26
|
-
from upgini.autofe.groupby import GroupByThenAgg,
|
|
12
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
27
13
|
from upgini.autofe.operand import Operand
|
|
28
|
-
from upgini.autofe.unary import Abs,
|
|
14
|
+
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
29
15
|
from upgini.autofe.vector import Mean, Sum
|
|
30
16
|
|
|
31
17
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
53
39
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
54
40
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
55
41
|
GroupByThenRank(),
|
|
56
|
-
Combine
|
|
57
|
-
CombineThenFreq
|
|
58
|
-
GroupByThenNUnique
|
|
59
|
-
GroupByThenFreq
|
|
42
|
+
Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
|
|
43
|
+
Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
|
|
44
|
+
Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
|
|
45
|
+
Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
|
|
60
46
|
Sim(),
|
|
61
47
|
DateDiff(),
|
|
62
48
|
DateDiffType2(),
|
|
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
73
59
|
DatePercentile(),
|
|
74
60
|
DatePercentileMethod2(),
|
|
75
61
|
Norm(),
|
|
76
|
-
JaroWinklerSim1(),
|
|
77
|
-
JaroWinklerSim2(),
|
|
78
|
-
LevenshteinSim(),
|
|
79
|
-
Distance(),
|
|
80
|
-
Embeddings(),
|
|
81
62
|
]
|
|
82
63
|
}
|
|
83
64
|
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Optional
|
|
3
|
-
import Levenshtein
|
|
4
1
|
import numpy as np
|
|
5
2
|
import pandas as pd
|
|
6
|
-
from
|
|
3
|
+
from numpy import dot
|
|
4
|
+
from numpy.linalg import norm
|
|
7
5
|
|
|
8
6
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
9
7
|
|
|
@@ -132,29 +130,7 @@ class CombineThenFreq(PandasOperand):
|
|
|
132
130
|
self._loc(temp, value_counts)
|
|
133
131
|
|
|
134
132
|
|
|
135
|
-
class Distance(PandasOperand):
|
|
136
|
-
name = "dist"
|
|
137
|
-
is_binary = True
|
|
138
|
-
output_type = "float"
|
|
139
|
-
is_symmetrical = True
|
|
140
|
-
has_symmetry_importance = True
|
|
141
|
-
|
|
142
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
|
-
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
# row-wise dot product
|
|
148
|
-
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
149
|
-
left = left.apply(lambda x: np.array(x))
|
|
150
|
-
right = right.apply(lambda x: np.array(x))
|
|
151
|
-
res = (left.dropna() * right.dropna()).apply(np.sum)
|
|
152
|
-
res = res.reindex(left.index.union(right.index))
|
|
153
|
-
return res
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# Left for backward compatibility
|
|
157
|
-
class Sim(Distance):
|
|
133
|
+
class Sim(PandasOperand):
|
|
158
134
|
name = "sim"
|
|
159
135
|
is_binary = True
|
|
160
136
|
output_type = "float"
|
|
@@ -162,71 +138,4 @@ class Sim(Distance):
|
|
|
162
138
|
has_symmetry_importance = True
|
|
163
139
|
|
|
164
140
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
165
|
-
return
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
class StringSim(PandasOperand, abc.ABC):
|
|
169
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
170
|
-
sims = []
|
|
171
|
-
for i in left.index:
|
|
172
|
-
left_i = self._prepare_value(left.get(i))
|
|
173
|
-
right_i = self._prepare_value(right.get(i))
|
|
174
|
-
if left_i is not None and right_i is not None:
|
|
175
|
-
sims.append(self._similarity(left_i, right_i))
|
|
176
|
-
else:
|
|
177
|
-
sims.append(None)
|
|
178
|
-
|
|
179
|
-
return pd.Series(sims, index=left.index)
|
|
180
|
-
|
|
181
|
-
@abc.abstractmethod
|
|
182
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
183
|
-
pass
|
|
184
|
-
|
|
185
|
-
@abc.abstractmethod
|
|
186
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
187
|
-
pass
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
class JaroWinklerSim1(StringSim):
|
|
191
|
-
name = "sim_jw1"
|
|
192
|
-
is_binary = True
|
|
193
|
-
input_type = "string"
|
|
194
|
-
output_type = "float"
|
|
195
|
-
is_symmetrical = True
|
|
196
|
-
has_symmetry_importance = True
|
|
197
|
-
|
|
198
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
|
-
return value
|
|
200
|
-
|
|
201
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
202
|
-
return jarowinkler_similarity(left, right)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
class JaroWinklerSim2(StringSim):
|
|
206
|
-
name = "sim_jw2"
|
|
207
|
-
is_binary = True
|
|
208
|
-
input_type = "string"
|
|
209
|
-
output_type = "float"
|
|
210
|
-
is_symmetrical = True
|
|
211
|
-
has_symmetry_importance = True
|
|
212
|
-
|
|
213
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
|
-
return value[::-1] if value is not None else None
|
|
215
|
-
|
|
216
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
217
|
-
return jarowinkler_similarity(left, right)
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
class LevenshteinSim(StringSim):
|
|
221
|
-
name = "sim_lv"
|
|
222
|
-
is_binary = True
|
|
223
|
-
input_type = "string"
|
|
224
|
-
output_type = "float"
|
|
225
|
-
is_symmetrical = True
|
|
226
|
-
has_symmetry_importance = True
|
|
227
|
-
|
|
228
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
|
-
return value
|
|
230
|
-
|
|
231
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
232
|
-
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
|
141
|
+
return dot(left, right) / (norm(left) * norm(right))
|
upgini/autofe/date.py
CHANGED
|
@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
-
replace_negative: bool = False
|
|
47
|
-
|
|
48
46
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
49
47
|
res = super().get_params()
|
|
50
48
|
res.update(
|
|
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
52
50
|
"diff_unit": self.diff_unit,
|
|
53
51
|
"left_unit": self.left_unit,
|
|
54
52
|
"right_unit": self.right_unit,
|
|
55
|
-
"replace_negative": self.replace_negative,
|
|
56
53
|
}
|
|
57
54
|
)
|
|
58
55
|
return res
|
|
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
61
|
return self.__replace_negative(diff)
|
|
65
62
|
|
|
66
63
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
67
|
-
|
|
68
|
-
x[x < 0] = None
|
|
64
|
+
x[x < 0] = None
|
|
69
65
|
return x
|
|
70
66
|
|
|
71
67
|
|
|
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
105
101
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
106
102
|
is_binary = True
|
|
107
103
|
has_symmetry_importance = True
|
|
108
|
-
|
|
109
104
|
aggregation: str
|
|
110
|
-
replace_negative: bool = False
|
|
111
105
|
|
|
112
106
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
113
107
|
res = super().get_params()
|
|
114
108
|
res.update(
|
|
115
109
|
{
|
|
116
110
|
"aggregation": self.aggregation,
|
|
117
|
-
"diff_unit": self.diff_unit,
|
|
118
|
-
"left_unit": self.left_unit,
|
|
119
|
-
"right_unit": self.right_unit,
|
|
120
|
-
"replace_negative": self.replace_negative,
|
|
121
111
|
}
|
|
122
112
|
)
|
|
123
113
|
return res
|
|
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
135
125
|
|
|
136
126
|
def _diff(self, x: TimedeltaArray):
|
|
137
127
|
x = self._convert_diff_to_unit(x)
|
|
138
|
-
return x[x > 0]
|
|
128
|
+
return x[x > 0]
|
|
139
129
|
|
|
140
130
|
def _agg(self, x):
|
|
141
131
|
method = getattr(np, self.aggregation, None)
|
|
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
|
|
|
167
157
|
super().__init__(**data)
|
|
168
158
|
|
|
169
159
|
def _agg(self, x):
|
|
170
|
-
x = x[
|
|
171
|
-
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
172
|
-
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
173
|
-
]
|
|
160
|
+
x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
|
|
174
161
|
return super()._agg(x)
|
|
175
162
|
|
|
176
163
|
|
upgini/autofe/feature.py
CHANGED
|
@@ -138,17 +138,15 @@ class Feature:
|
|
|
138
138
|
if self.cached_display_name is not None and cache:
|
|
139
139
|
return self.cached_display_name
|
|
140
140
|
|
|
141
|
-
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
142
|
-
prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
|
|
143
|
-
|
|
144
141
|
if self.alias:
|
|
145
142
|
components = ["f_autofe", self.alias]
|
|
146
|
-
elif shorten and
|
|
147
|
-
components = ["f_autofe"
|
|
143
|
+
elif shorten and not self.op.is_unary:
|
|
144
|
+
components = ["f_autofe", self.get_op_display_name()]
|
|
148
145
|
else:
|
|
149
|
-
components = (
|
|
150
|
-
|
|
151
|
-
|
|
146
|
+
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
|
|
147
|
+
"autofe",
|
|
148
|
+
self.get_op_display_name(),
|
|
149
|
+
]
|
|
152
150
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
153
151
|
display_name = "_".join(components)
|
|
154
152
|
|
|
@@ -239,19 +237,12 @@ class Feature:
|
|
|
239
237
|
|
|
240
238
|
@staticmethod
|
|
241
239
|
def from_formula(string: str) -> Union[Column, "Feature"]:
|
|
240
|
+
if string[-1] != ")":
|
|
241
|
+
return Column(string)
|
|
242
242
|
|
|
243
243
|
def is_trivial_char(c: str) -> bool:
|
|
244
244
|
return c not in "()+-*/,"
|
|
245
245
|
|
|
246
|
-
if string[-1] != ")":
|
|
247
|
-
if all(is_trivial_char(c) for c in string):
|
|
248
|
-
return Column(string)
|
|
249
|
-
else:
|
|
250
|
-
raise ValueError(
|
|
251
|
-
f"Unsupported column name: {string}. Column names should not have characters: "
|
|
252
|
-
"['(', ')', '+', '-', '*', '/', ',']"
|
|
253
|
-
)
|
|
254
|
-
|
|
255
246
|
def find_prev(string: str) -> int:
|
|
256
247
|
if string[-1] != ")":
|
|
257
248
|
return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
|
|
@@ -273,11 +264,8 @@ class Feature:
|
|
|
273
264
|
return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
|
|
274
265
|
p1 = find_prev(string[: p2 - 1])
|
|
275
266
|
if string[0] == "(":
|
|
276
|
-
op = find_op(string[p2 - 1])
|
|
277
|
-
if op is None:
|
|
278
|
-
raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
|
|
279
267
|
return Feature(
|
|
280
|
-
|
|
268
|
+
find_op(string[p2 - 1]),
|
|
281
269
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
282
270
|
)
|
|
283
271
|
else:
|
|
@@ -288,8 +276,6 @@ class Feature:
|
|
|
288
276
|
[Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
|
|
289
277
|
)
|
|
290
278
|
else:
|
|
291
|
-
if string[p1 - 1] == "(":
|
|
292
|
-
raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
|
|
293
279
|
base_features = [
|
|
294
280
|
Feature.from_formula(string[p2:-1]),
|
|
295
281
|
Feature.from_formula(string[p1 : p2 - 1]),
|
|
@@ -335,10 +321,10 @@ class FeatureGroup:
|
|
|
335
321
|
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
336
322
|
if any(isinstance(f, Feature) for f in lower_order_children):
|
|
337
323
|
child_data = pd.concat(
|
|
338
|
-
[data[main_column
|
|
324
|
+
[data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
|
|
339
325
|
axis=1,
|
|
340
326
|
)
|
|
341
|
-
child_data.columns =
|
|
327
|
+
child_data.columns = [main_column] + lower_order_names
|
|
342
328
|
else:
|
|
343
329
|
child_data = data[columns]
|
|
344
330
|
|
upgini/autofe/unary.py
CHANGED
|
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
class Embeddings(PandasOperand):
|
|
131
|
-
name = "emb"
|
|
132
|
-
is_unary = True
|
|
133
|
-
input_type = "string"
|
|
134
|
-
output_type = "vector"
|