upgini 1.1.298__py3-none-any.whl → 1.1.299a3511.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +26 -7
- upgini/autofe/binary.py +91 -2
- upgini/autofe/date.py +16 -3
- upgini/autofe/feature.py +3 -2
- upgini/autofe/unary.py +7 -0
- {upgini-1.1.298.dist-info → upgini-1.1.299a3511.dev6.dist-info}/METADATA +5 -3
- {upgini-1.1.298.dist-info → upgini-1.1.299a3511.dev6.dist-info}/RECORD +10 -10
- {upgini-1.1.298.dist-info → upgini-1.1.299a3511.dev6.dist-info}/WHEEL +0 -0
- {upgini-1.1.298.dist-info → upgini-1.1.299a3511.dev6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.
|
|
1
|
+
__version__ = "1.1.299a3511.dev6"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,6 +1,20 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
3
|
+
from upgini.autofe.binary import (
|
|
4
|
+
Add,
|
|
5
|
+
Combine,
|
|
6
|
+
CombineThenFreq,
|
|
7
|
+
Distance,
|
|
8
|
+
Divide,
|
|
9
|
+
JaroWinklerSim1,
|
|
10
|
+
JaroWinklerSim2,
|
|
11
|
+
LevenshteinSim,
|
|
12
|
+
Max,
|
|
13
|
+
Min,
|
|
14
|
+
Multiply,
|
|
15
|
+
Sim,
|
|
16
|
+
Subtract,
|
|
17
|
+
)
|
|
4
18
|
from upgini.autofe.date import (
|
|
5
19
|
DateDiff,
|
|
6
20
|
DateDiffType2,
|
|
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
|
|
|
9
23
|
DatePercentile,
|
|
10
24
|
DatePercentileMethod2,
|
|
11
25
|
)
|
|
12
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
26
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
13
27
|
from upgini.autofe.operand import Operand
|
|
14
|
-
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
28
|
+
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
15
29
|
from upgini.autofe.vector import Mean, Sum
|
|
16
30
|
|
|
17
31
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
39
53
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
40
54
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
41
55
|
GroupByThenRank(),
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
56
|
+
Combine(),
|
|
57
|
+
CombineThenFreq(),
|
|
58
|
+
GroupByThenNUnique(),
|
|
59
|
+
GroupByThenFreq(),
|
|
46
60
|
Sim(),
|
|
47
61
|
DateDiff(),
|
|
48
62
|
DateDiffType2(),
|
|
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
59
73
|
DatePercentile(),
|
|
60
74
|
DatePercentileMethod2(),
|
|
61
75
|
Norm(),
|
|
76
|
+
JaroWinklerSim1(),
|
|
77
|
+
JaroWinklerSim2(),
|
|
78
|
+
LevenshteinSim(),
|
|
79
|
+
Distance(),
|
|
80
|
+
Embeddings(),
|
|
62
81
|
]
|
|
63
82
|
}
|
|
64
83
|
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
6
|
from numpy import dot
|
|
4
7
|
from numpy.linalg import norm
|
|
8
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
9
|
|
|
6
10
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
11
|
|
|
@@ -130,7 +134,25 @@ class CombineThenFreq(PandasOperand):
|
|
|
130
134
|
self._loc(temp, value_counts)
|
|
131
135
|
|
|
132
136
|
|
|
133
|
-
class
|
|
137
|
+
class Distance(PandasOperand):
|
|
138
|
+
name = "dist"
|
|
139
|
+
is_binary = True
|
|
140
|
+
output_type = "float"
|
|
141
|
+
is_symmetrical = True
|
|
142
|
+
has_symmetry_importance = True
|
|
143
|
+
|
|
144
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
145
|
+
return pd.Series(
|
|
146
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# row-wise dot product
|
|
150
|
+
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
151
|
+
return (left * right).apply(np.sum)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Left for backward compatibility
|
|
155
|
+
class Sim(Distance):
|
|
134
156
|
name = "sim"
|
|
135
157
|
is_binary = True
|
|
136
158
|
output_type = "float"
|
|
@@ -138,4 +160,71 @@ class Sim(PandasOperand):
|
|
|
138
160
|
has_symmetry_importance = True
|
|
139
161
|
|
|
140
162
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
141
|
-
return
|
|
163
|
+
return 1 - super().calculate_binary(left, right)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class StringSim(PandasOperand, abc.ABC):
|
|
167
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
|
+
sims = []
|
|
169
|
+
for i in left.index:
|
|
170
|
+
left_i = self._prepare_value(left.get(i))
|
|
171
|
+
right_i = self._prepare_value(right.get(i))
|
|
172
|
+
if left_i is not None and right_i is not None:
|
|
173
|
+
sims.append(self._similarity(left_i, right_i))
|
|
174
|
+
else:
|
|
175
|
+
sims.append(None)
|
|
176
|
+
|
|
177
|
+
return pd.Series(sims, index=left.index)
|
|
178
|
+
|
|
179
|
+
@abc.abstractmethod
|
|
180
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
181
|
+
pass
|
|
182
|
+
|
|
183
|
+
@abc.abstractmethod
|
|
184
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
185
|
+
pass
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class JaroWinklerSim1(StringSim):
|
|
189
|
+
name = "sim_jw1"
|
|
190
|
+
is_binary = True
|
|
191
|
+
input_type = "string"
|
|
192
|
+
output_type = "float"
|
|
193
|
+
is_symmetrical = True
|
|
194
|
+
has_symmetry_importance = True
|
|
195
|
+
|
|
196
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
197
|
+
return value
|
|
198
|
+
|
|
199
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
200
|
+
return jarowinkler_similarity(left, right)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class JaroWinklerSim2(StringSim):
|
|
204
|
+
name = "sim_jw2"
|
|
205
|
+
is_binary = True
|
|
206
|
+
input_type = "string"
|
|
207
|
+
output_type = "float"
|
|
208
|
+
is_symmetrical = True
|
|
209
|
+
has_symmetry_importance = True
|
|
210
|
+
|
|
211
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
212
|
+
return value[::-1] if value is not None else None
|
|
213
|
+
|
|
214
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
215
|
+
return jarowinkler_similarity(left, right)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class LevenshteinSim(StringSim):
|
|
219
|
+
name = "sim_lv"
|
|
220
|
+
is_binary = True
|
|
221
|
+
input_type = "string"
|
|
222
|
+
output_type = "float"
|
|
223
|
+
is_symmetrical = True
|
|
224
|
+
has_symmetry_importance = True
|
|
225
|
+
|
|
226
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
227
|
+
return value
|
|
228
|
+
|
|
229
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
230
|
+
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
upgini/autofe/date.py
CHANGED
|
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
+
replace_negative: bool = False
|
|
47
|
+
|
|
46
48
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
47
49
|
res = super().get_params()
|
|
48
50
|
res.update(
|
|
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
50
52
|
"diff_unit": self.diff_unit,
|
|
51
53
|
"left_unit": self.left_unit,
|
|
52
54
|
"right_unit": self.right_unit,
|
|
55
|
+
"replace_negative": self.replace_negative,
|
|
53
56
|
}
|
|
54
57
|
)
|
|
55
58
|
return res
|
|
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
61
64
|
return self.__replace_negative(diff)
|
|
62
65
|
|
|
63
66
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
64
|
-
|
|
67
|
+
if self.replace_negative:
|
|
68
|
+
x[x < 0] = None
|
|
65
69
|
return x
|
|
66
70
|
|
|
67
71
|
|
|
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
101
105
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
102
106
|
is_binary = True
|
|
103
107
|
has_symmetry_importance = True
|
|
108
|
+
|
|
104
109
|
aggregation: str
|
|
110
|
+
replace_negative: bool = False
|
|
105
111
|
|
|
106
112
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
107
113
|
res = super().get_params()
|
|
108
114
|
res.update(
|
|
109
115
|
{
|
|
110
116
|
"aggregation": self.aggregation,
|
|
117
|
+
"diff_unit": self.diff_unit,
|
|
118
|
+
"left_unit": self.left_unit,
|
|
119
|
+
"right_unit": self.right_unit,
|
|
120
|
+
"replace_negative": self.replace_negative,
|
|
111
121
|
}
|
|
112
122
|
)
|
|
113
123
|
return res
|
|
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
125
135
|
|
|
126
136
|
def _diff(self, x: TimedeltaArray):
|
|
127
137
|
x = self._convert_diff_to_unit(x)
|
|
128
|
-
return x[x > 0]
|
|
138
|
+
return x[x > 0] if self.replace_negative else x
|
|
129
139
|
|
|
130
140
|
def _agg(self, x):
|
|
131
141
|
method = getattr(np, self.aggregation, None)
|
|
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
|
|
|
157
167
|
super().__init__(**data)
|
|
158
168
|
|
|
159
169
|
def _agg(self, x):
|
|
160
|
-
x = x[
|
|
170
|
+
x = x[
|
|
171
|
+
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
172
|
+
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
173
|
+
]
|
|
161
174
|
return super()._agg(x)
|
|
162
175
|
|
|
163
176
|
|
upgini/autofe/feature.py
CHANGED
|
@@ -140,8 +140,9 @@ class Feature:
|
|
|
140
140
|
|
|
141
141
|
if self.alias:
|
|
142
142
|
components = ["f_autofe", self.alias]
|
|
143
|
-
elif shorten and not self.op.is_unary:
|
|
144
|
-
|
|
143
|
+
elif shorten and not (self.op.is_unary and all(isinstance(c, Column) for c in self.children)):
|
|
144
|
+
prev_name = [self.children[0].get_op_display_name()] if self.op.is_unary else []
|
|
145
|
+
components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
|
|
145
146
|
else:
|
|
146
147
|
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
|
|
147
148
|
"autofe",
|
upgini/autofe/unary.py
CHANGED
|
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Embeddings(PandasOperand):
|
|
131
|
+
name = "emb"
|
|
132
|
+
is_unary = True
|
|
133
|
+
input_type = "string"
|
|
134
|
+
output_type = "vector"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.299a3511.dev6
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
+
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
+
Requires-Dist: levenshtein>=0.25.1
|
|
29
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
30
32
|
Requires-Dist: numpy>=1.19.0
|
|
31
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -131,7 +133,7 @@ Description-Content-Type: text/markdown
|
|
|
131
133
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
132
134
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
133
135
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
134
|
-
|World mobile & fixed broadband network coverage and
|
|
136
|
+
|World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
135
137
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
136
138
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
137
139
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -840,4 +842,4 @@ Some convenient ways to start contributing are:
|
|
|
840
842
|
- [More perks for registered users](https://profile.upgini.com)
|
|
841
843
|
|
|
842
844
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
843
|
-
Please report it here</a></sup>
|
|
845
|
+
Please report it here</a></sup>
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=_0z3wkU1Qyf7uc0tWztaZ9d93IS373XBtHXVE9Apmzw,34
|
|
2
2
|
upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
|
|
@@ -14,13 +14,13 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
|
|
18
|
+
upgini/autofe/binary.py,sha256=ml0MszLARZqp3UGUqTGsVjT4DD69zTisfBBEqbZ7klU,6767
|
|
19
|
+
upgini/autofe/date.py,sha256=Qq11EGLFHJxy5DQF2V1CBMtH2j4g5RpinRcw-7SobMs,8442
|
|
20
|
+
upgini/autofe/feature.py,sha256=cPbLJYAfzT8VqMDOGuEOBslJEDTdVphozQf6fCD8uuk,13587
|
|
21
21
|
upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
|
|
22
22
|
upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
|
|
24
24
|
upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=1cQZrK630VztwGGDp41ec9gqIeUtkefaqSSQEitVWiM,19581
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.299a3511.dev6.dist-info/METADATA,sha256=KzZj0GPmhe4dHrujcrKXrqe3xtQCN7OMGYPUjLKJGpA,48230
|
|
61
|
+
upgini-1.1.299a3511.dev6.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
62
|
+
upgini-1.1.299a3511.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.299a3511.dev6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|