upgini 1.1.299a3511.dev9__tar.gz → 1.1.300__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/PKG-INFO +3 -5
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/README.md +2 -2
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/pyproject.toml +0 -3
- upgini-1.1.300/src/upgini/__about__.py +1 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/all_operands.py +7 -26
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/binary.py +2 -93
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/date.py +3 -16
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/feature.py +8 -10
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/unary.py +0 -7
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/dataset.py +11 -2
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/features_enricher.py +223 -103
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/metadata.py +10 -2
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/metrics.py +1 -1
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/resource_bundle/strings.properties +1 -0
- upgini-1.1.300/src/upgini/utils/base_search_key_detector.py +27 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/deduplicate_utils.py +11 -1
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/email_utils.py +5 -0
- upgini-1.1.299a3511.dev9/src/upgini/__about__.py +0 -1
- upgini-1.1.299a3511.dev9/src/upgini/utils/base_search_key_detector.py +0 -25
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/.gitignore +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/LICENSE +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/ads.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/errors.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/http.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/lazy_import.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/search_task.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/spinner.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.299a3511.dev9 → upgini-1.1.300}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.299a3511.dev9
|
|
3
|
+
Version: 1.1.300
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
-
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
-
Requires-Dist: levenshtein>=0.25.1
|
|
31
29
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
30
|
Requires-Dist: numpy>=1.19.0
|
|
33
31
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -133,7 +131,7 @@ Description-Content-Type: text/markdown
|
|
|
133
131
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
134
132
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
135
133
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
136
|
-
|World mobile & fixed broadband network coverage and
|
|
134
|
+
|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
137
135
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
138
136
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
139
137
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -842,4 +840,4 @@ Some convenient ways to start contributing are:
|
|
|
842
840
|
- [More perks for registered users](https://profile.upgini.com)
|
|
843
841
|
|
|
844
842
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
845
|
-
Please report it here</a></sup>
|
|
843
|
+
Please report it here</a></sup>
|
|
@@ -90,7 +90,7 @@
|
|
|
90
90
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
91
91
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
92
92
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
93
|
-
|World mobile & fixed broadband network coverage and
|
|
93
|
+
|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
94
94
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
95
95
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
96
96
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
|
|
|
799
799
|
- [More perks for registered users](https://profile.upgini.com)
|
|
800
800
|
|
|
801
801
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
802
|
-
Please report it here</a></sup>
|
|
802
|
+
Please report it here</a></sup>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.300"
|
|
@@ -1,20 +1,6 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import (
|
|
4
|
-
Add,
|
|
5
|
-
Combine,
|
|
6
|
-
CombineThenFreq,
|
|
7
|
-
Distance,
|
|
8
|
-
Divide,
|
|
9
|
-
JaroWinklerSim1,
|
|
10
|
-
JaroWinklerSim2,
|
|
11
|
-
LevenshteinSim,
|
|
12
|
-
Max,
|
|
13
|
-
Min,
|
|
14
|
-
Multiply,
|
|
15
|
-
Sim,
|
|
16
|
-
Subtract,
|
|
17
|
-
)
|
|
3
|
+
from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
|
|
18
4
|
from upgini.autofe.date import (
|
|
19
5
|
DateDiff,
|
|
20
6
|
DateDiffType2,
|
|
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
|
|
|
23
9
|
DatePercentile,
|
|
24
10
|
DatePercentileMethod2,
|
|
25
11
|
)
|
|
26
|
-
from upgini.autofe.groupby import GroupByThenAgg,
|
|
12
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
27
13
|
from upgini.autofe.operand import Operand
|
|
28
|
-
from upgini.autofe.unary import Abs,
|
|
14
|
+
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
29
15
|
from upgini.autofe.vector import Mean, Sum
|
|
30
16
|
|
|
31
17
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
53
39
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
54
40
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
55
41
|
GroupByThenRank(),
|
|
56
|
-
Combine
|
|
57
|
-
CombineThenFreq
|
|
58
|
-
GroupByThenNUnique
|
|
59
|
-
GroupByThenFreq
|
|
42
|
+
Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
|
|
43
|
+
Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
|
|
44
|
+
Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
|
|
45
|
+
Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
|
|
60
46
|
Sim(),
|
|
61
47
|
DateDiff(),
|
|
62
48
|
DateDiffType2(),
|
|
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
73
59
|
DatePercentile(),
|
|
74
60
|
DatePercentileMethod2(),
|
|
75
61
|
Norm(),
|
|
76
|
-
JaroWinklerSim1(),
|
|
77
|
-
JaroWinklerSim2(),
|
|
78
|
-
LevenshteinSim(),
|
|
79
|
-
Distance(),
|
|
80
|
-
Embeddings(),
|
|
81
62
|
]
|
|
82
63
|
}
|
|
83
64
|
|
|
@@ -1,11 +1,7 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Optional
|
|
3
|
-
import Levenshtein
|
|
4
1
|
import numpy as np
|
|
5
2
|
import pandas as pd
|
|
6
3
|
from numpy import dot
|
|
7
4
|
from numpy.linalg import norm
|
|
8
|
-
from jarowinkler import jarowinkler_similarity
|
|
9
5
|
|
|
10
6
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
11
7
|
|
|
@@ -134,27 +130,7 @@ class CombineThenFreq(PandasOperand):
|
|
|
134
130
|
self._loc(temp, value_counts)
|
|
135
131
|
|
|
136
132
|
|
|
137
|
-
class Distance(PandasOperand):
|
|
138
|
-
name = "dist"
|
|
139
|
-
is_binary = True
|
|
140
|
-
output_type = "float"
|
|
141
|
-
is_symmetrical = True
|
|
142
|
-
has_symmetry_importance = True
|
|
143
|
-
|
|
144
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
145
|
-
return pd.Series(
|
|
146
|
-
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
147
|
-
)
|
|
148
|
-
|
|
149
|
-
# row-wise dot product
|
|
150
|
-
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
151
|
-
res = (left.dropna() * right.dropna()).apply(np.sum)
|
|
152
|
-
res = res.reindex(left.index.union(right.index))
|
|
153
|
-
return res
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# Left for backward compatibility
|
|
157
|
-
class Sim(Distance):
|
|
133
|
+
class Sim(PandasOperand):
|
|
158
134
|
name = "sim"
|
|
159
135
|
is_binary = True
|
|
160
136
|
output_type = "float"
|
|
@@ -162,71 +138,4 @@ class Sim(Distance):
|
|
|
162
138
|
has_symmetry_importance = True
|
|
163
139
|
|
|
164
140
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
165
|
-
return
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
class StringSim(PandasOperand, abc.ABC):
|
|
169
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
170
|
-
sims = []
|
|
171
|
-
for i in left.index:
|
|
172
|
-
left_i = self._prepare_value(left.get(i))
|
|
173
|
-
right_i = self._prepare_value(right.get(i))
|
|
174
|
-
if left_i is not None and right_i is not None:
|
|
175
|
-
sims.append(self._similarity(left_i, right_i))
|
|
176
|
-
else:
|
|
177
|
-
sims.append(None)
|
|
178
|
-
|
|
179
|
-
return pd.Series(sims, index=left.index)
|
|
180
|
-
|
|
181
|
-
@abc.abstractmethod
|
|
182
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
183
|
-
pass
|
|
184
|
-
|
|
185
|
-
@abc.abstractmethod
|
|
186
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
187
|
-
pass
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
class JaroWinklerSim1(StringSim):
|
|
191
|
-
name = "sim_jw1"
|
|
192
|
-
is_binary = True
|
|
193
|
-
input_type = "string"
|
|
194
|
-
output_type = "float"
|
|
195
|
-
is_symmetrical = True
|
|
196
|
-
has_symmetry_importance = True
|
|
197
|
-
|
|
198
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
|
-
return value
|
|
200
|
-
|
|
201
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
202
|
-
return jarowinkler_similarity(left, right)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
class JaroWinklerSim2(StringSim):
|
|
206
|
-
name = "sim_jw2"
|
|
207
|
-
is_binary = True
|
|
208
|
-
input_type = "string"
|
|
209
|
-
output_type = "float"
|
|
210
|
-
is_symmetrical = True
|
|
211
|
-
has_symmetry_importance = True
|
|
212
|
-
|
|
213
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
|
-
return value[::-1] if value is not None else None
|
|
215
|
-
|
|
216
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
217
|
-
return jarowinkler_similarity(left, right)
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
class LevenshteinSim(StringSim):
|
|
221
|
-
name = "sim_lv"
|
|
222
|
-
is_binary = True
|
|
223
|
-
input_type = "string"
|
|
224
|
-
output_type = "float"
|
|
225
|
-
is_symmetrical = True
|
|
226
|
-
has_symmetry_importance = True
|
|
227
|
-
|
|
228
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
|
-
return value
|
|
230
|
-
|
|
231
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
232
|
-
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
|
141
|
+
return dot(left, right) / (norm(left) * norm(right))
|
|
@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
-
replace_negative: bool = False
|
|
47
|
-
|
|
48
46
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
49
47
|
res = super().get_params()
|
|
50
48
|
res.update(
|
|
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
52
50
|
"diff_unit": self.diff_unit,
|
|
53
51
|
"left_unit": self.left_unit,
|
|
54
52
|
"right_unit": self.right_unit,
|
|
55
|
-
"replace_negative": self.replace_negative,
|
|
56
53
|
}
|
|
57
54
|
)
|
|
58
55
|
return res
|
|
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
61
|
return self.__replace_negative(diff)
|
|
65
62
|
|
|
66
63
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
67
|
-
|
|
68
|
-
x[x < 0] = None
|
|
64
|
+
x[x < 0] = None
|
|
69
65
|
return x
|
|
70
66
|
|
|
71
67
|
|
|
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
105
101
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
106
102
|
is_binary = True
|
|
107
103
|
has_symmetry_importance = True
|
|
108
|
-
|
|
109
104
|
aggregation: str
|
|
110
|
-
replace_negative: bool = False
|
|
111
105
|
|
|
112
106
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
113
107
|
res = super().get_params()
|
|
114
108
|
res.update(
|
|
115
109
|
{
|
|
116
110
|
"aggregation": self.aggregation,
|
|
117
|
-
"diff_unit": self.diff_unit,
|
|
118
|
-
"left_unit": self.left_unit,
|
|
119
|
-
"right_unit": self.right_unit,
|
|
120
|
-
"replace_negative": self.replace_negative,
|
|
121
111
|
}
|
|
122
112
|
)
|
|
123
113
|
return res
|
|
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
135
125
|
|
|
136
126
|
def _diff(self, x: TimedeltaArray):
|
|
137
127
|
x = self._convert_diff_to_unit(x)
|
|
138
|
-
return x[x > 0]
|
|
128
|
+
return x[x > 0]
|
|
139
129
|
|
|
140
130
|
def _agg(self, x):
|
|
141
131
|
method = getattr(np, self.aggregation, None)
|
|
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
|
|
|
167
157
|
super().__init__(**data)
|
|
168
158
|
|
|
169
159
|
def _agg(self, x):
|
|
170
|
-
x = x[
|
|
171
|
-
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
172
|
-
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
173
|
-
]
|
|
160
|
+
x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
|
|
174
161
|
return super()._agg(x)
|
|
175
162
|
|
|
176
163
|
|
|
@@ -138,17 +138,15 @@ class Feature:
|
|
|
138
138
|
if self.cached_display_name is not None and cache:
|
|
139
139
|
return self.cached_display_name
|
|
140
140
|
|
|
141
|
-
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
142
|
-
prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
|
|
143
|
-
|
|
144
141
|
if self.alias:
|
|
145
142
|
components = ["f_autofe", self.alias]
|
|
146
|
-
elif shorten and
|
|
147
|
-
components = ["f_autofe"
|
|
143
|
+
elif shorten and not self.op.is_unary:
|
|
144
|
+
components = ["f_autofe", self.get_op_display_name()]
|
|
148
145
|
else:
|
|
149
|
-
components = (
|
|
150
|
-
|
|
151
|
-
|
|
146
|
+
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
|
|
147
|
+
"autofe",
|
|
148
|
+
self.get_op_display_name(),
|
|
149
|
+
]
|
|
152
150
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
153
151
|
display_name = "_".join(components)
|
|
154
152
|
|
|
@@ -323,10 +321,10 @@ class FeatureGroup:
|
|
|
323
321
|
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
324
322
|
if any(isinstance(f, Feature) for f in lower_order_children):
|
|
325
323
|
child_data = pd.concat(
|
|
326
|
-
[data[main_column
|
|
324
|
+
[data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
|
|
327
325
|
axis=1,
|
|
328
326
|
)
|
|
329
|
-
child_data.columns =
|
|
327
|
+
child_data.columns = [main_column] + lower_order_names
|
|
330
328
|
else:
|
|
331
329
|
child_data = data[columns]
|
|
332
330
|
|
|
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
class Embeddings(PandasOperand):
|
|
131
|
-
name = "emb"
|
|
132
|
-
is_unary = True
|
|
133
|
-
input_type = "string"
|
|
134
|
-
output_type = "vector"
|
|
@@ -23,7 +23,9 @@ from pandas.api.types import (
|
|
|
23
23
|
from upgini.errors import ValidationError
|
|
24
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
25
|
from upgini.metadata import (
|
|
26
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
26
27
|
EVAL_SET_INDEX,
|
|
28
|
+
SEARCH_KEY_UNNEST,
|
|
27
29
|
SYSTEM_COLUMNS,
|
|
28
30
|
SYSTEM_RECORD_ID,
|
|
29
31
|
TARGET,
|
|
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
79
81
|
path: Optional[str] = None,
|
|
80
82
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
81
83
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
84
|
+
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
82
85
|
model_task_type: Optional[ModelTaskType] = None,
|
|
83
86
|
random_state: Optional[int] = None,
|
|
84
87
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
113
116
|
self.description = description
|
|
114
117
|
self.meaning_types = meaning_types
|
|
115
118
|
self.search_keys = search_keys
|
|
119
|
+
self.unnest_search_keys = unnest_search_keys
|
|
116
120
|
self.ignore_columns = []
|
|
117
121
|
self.hierarchical_group_keys = []
|
|
118
122
|
self.hierarchical_subgroup_keys = []
|
|
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
172
176
|
new_columns = []
|
|
173
177
|
dup_counter = 0
|
|
174
178
|
for column in self.data.columns:
|
|
175
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
|
|
176
180
|
self.columns_renaming[column] = column
|
|
177
181
|
new_columns.append(column)
|
|
178
182
|
continue
|
|
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
353
357
|
|
|
354
358
|
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
355
359
|
try:
|
|
356
|
-
self.data[postal_code] =
|
|
360
|
+
self.data[postal_code] = (
|
|
361
|
+
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
362
|
+
)
|
|
357
363
|
except Exception:
|
|
358
364
|
pass
|
|
359
365
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
803
809
|
meaningType=meaning_type,
|
|
804
810
|
minMaxValues=min_max_values,
|
|
805
811
|
)
|
|
812
|
+
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
813
|
+
column_meta.isUnnest = True
|
|
814
|
+
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
806
815
|
|
|
807
816
|
columns.append(column_meta)
|
|
808
817
|
|