upgini 1.1.295__tar.gz → 1.1.296a3511.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/PKG-INFO +1 -1
- upgini-1.1.296a3511.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/all_operands.py +24 -6
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/binary.py +81 -2
- upgini-1.1.295/src/upgini/__about__.py +0 -1
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/.gitignore +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/LICENSE +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/README.md +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/pyproject.toml +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/dataset.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/features_enricher.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/http.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/metadata.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/metrics.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.296a3511.dev1"
|
|
@@ -1,8 +1,22 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
3
|
+
from upgini.autofe.binary import (
|
|
4
|
+
Add,
|
|
5
|
+
Combine,
|
|
6
|
+
CombineThenFreq,
|
|
7
|
+
Distance,
|
|
8
|
+
Divide,
|
|
9
|
+
JaroWinklerSim1,
|
|
10
|
+
JaroWinklerSim2,
|
|
11
|
+
LevenshteinSim,
|
|
12
|
+
Max,
|
|
13
|
+
Min,
|
|
14
|
+
Multiply,
|
|
15
|
+
Sim,
|
|
16
|
+
Subtract,
|
|
17
|
+
)
|
|
4
18
|
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
|
|
5
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
19
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
6
20
|
from upgini.autofe.operand import Operand
|
|
7
21
|
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
8
22
|
from upgini.autofe.vector import Mean, Sum
|
|
@@ -32,10 +46,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
32
46
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
33
47
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
34
48
|
GroupByThenRank(),
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
Combine(),
|
|
50
|
+
CombineThenFreq(),
|
|
51
|
+
GroupByThenNUnique(),
|
|
52
|
+
GroupByThenFreq(),
|
|
39
53
|
Sim(),
|
|
40
54
|
DateDiff(),
|
|
41
55
|
DateDiffType2(),
|
|
@@ -51,6 +65,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
51
65
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
52
66
|
DatePercentile(),
|
|
53
67
|
Norm(),
|
|
68
|
+
JaroWinklerSim1(),
|
|
69
|
+
JaroWinklerSim2(),
|
|
70
|
+
LevenshteinSim(),
|
|
71
|
+
Distance(),
|
|
54
72
|
]
|
|
55
73
|
}
|
|
56
74
|
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
6
|
from numpy import dot
|
|
4
7
|
from numpy.linalg import norm
|
|
8
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
9
|
|
|
6
10
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
11
|
|
|
@@ -130,8 +134,8 @@ class CombineThenFreq(PandasOperand):
|
|
|
130
134
|
self._loc(temp, value_counts)
|
|
131
135
|
|
|
132
136
|
|
|
133
|
-
class
|
|
134
|
-
name = "
|
|
137
|
+
class Distance(PandasOperand):
|
|
138
|
+
name = "dist"
|
|
135
139
|
is_binary = True
|
|
136
140
|
output_type = "float"
|
|
137
141
|
is_symmetrical = True
|
|
@@ -139,3 +143,78 @@ class Sim(PandasOperand):
|
|
|
139
143
|
|
|
140
144
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
141
145
|
return dot(left, right) / (norm(left) * norm(right))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class Sim(Distance):
    """Binary operand registered under the name ``"sim"``.

    Subclasses ``Distance`` and declares no methods of its own, so the
    computation is whatever ``Distance`` provides. The flags below are
    spelled out explicitly on this class rather than left to inheritance.
    """

    # Identifier this operand is exposed under.
    name = "sim"
    # Takes exactly two inputs and produces a float column.
    is_binary = True
    output_type = "float"
    # Declared symmetric in its two arguments.
    is_symmetrical = True
    has_symmetry_importance = True
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class StringSim(PandasOperand, abc.ABC):
    """Base class for binary operands that score the similarity of two string columns.

    Subclasses provide ``_prepare_value`` (per-value preprocessing) and
    ``_similarity`` (the pairwise metric). This class pairs the rows and
    handles missing values.
    """

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return the row-wise similarity between ``left`` and ``right``.

        Rows are paired by the labels of ``left.index``. A row yields ``None``
        when either side is missing (``None``, ``NaN``, ``pd.NA``, or a label
        that is absent from ``right``) or when ``_prepare_value`` returns
        ``None`` for either side.

        Args:
            left: First column of values.
            right: Second column of values, looked up by the labels of ``left``.

        Returns:
            A series indexed like ``left`` holding one score (or ``None``) per row.
        """

        def none_if_missing(value):
            # Series holding strings usually mark gaps with NaN rather than None.
            # Passing NaN on would make the string metrics fail with TypeError,
            # so every missing scalar is collapsed to None first. The is_scalar
            # guard keeps pd.isna from returning an array for non-scalar cells.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return None
            return value

        sims = []
        for i in left.index:
            left_i = self._prepare_value(none_if_missing(left.get(i)))
            right_i = self._prepare_value(none_if_missing(right.get(i)))
            if left_i is not None and right_i is not None:
                sims.append(self._similarity(left_i, right_i))
            else:
                sims.append(None)

        return pd.Series(sims, index=left.index)

    @abc.abstractmethod
    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Preprocess one value before comparison; return ``None`` to skip the row."""
        pass

    @abc.abstractmethod
    def _similarity(self, left: str, right: str) -> float:
        """Return the similarity score of two prepared, non-missing strings."""
        pass
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class JaroWinklerSim1(StringSim):
    """Jaro-Winkler similarity of two strings, compared as given."""

    name = "sim_jw1"
    is_binary = True
    input_type = "string"
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Pass the value through unchanged (no preprocessing for this variant)."""
        return value

    def _similarity(self, left: str, right: str) -> float:
        """Score ``left`` against ``right`` with ``jarowinkler_similarity``."""
        score = jarowinkler_similarity(left, right)
        return score
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class JaroWinklerSim2(StringSim):
    """Jaro-Winkler similarity computed on the reversed strings."""

    name = "sim_jw2"
    is_binary = True
    input_type = "string"
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Reverse the string; a ``None`` value stays ``None``."""
        if value is None:
            return None
        return value[::-1]

    def _similarity(self, left: str, right: str) -> float:
        """Score the two (already reversed) strings with ``jarowinkler_similarity``."""
        score = jarowinkler_similarity(left, right)
        return score
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class LevenshteinSim(StringSim):
    """Similarity derived from Levenshtein edit distance.

    The score is ``1 - distance / max(len(left), len(right))``.
    """

    name = "sim_lv"
    is_binary = True
    input_type = "string"
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Pass the value through unchanged (no preprocessing for this metric)."""
        return value

    def _similarity(self, left: str, right: str) -> float:
        """Return the length-normalised Levenshtein similarity of two strings.

        Args:
            left: First string.
            right: Second string.

        Returns:
            ``1 - distance / longest``, where ``longest`` is the length of the
            longer input. Two empty strings score ``1.0``.
        """
        longest = max(len(left), len(right))
        if longest == 0:
            # Both strings are empty, hence identical. Without this guard the
            # normalisation below would divide by zero.
            return 1.0
        return 1 - Levenshtein.distance(left, right) / longest
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.1.295"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|