upgini 1.1.316a4__tar.gz → 1.1.317__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.316a4 → upgini-1.1.317}/PKG-INFO +3 -3
- {upgini-1.1.316a4 → upgini-1.1.317}/pyproject.toml +5 -6
- upgini-1.1.317/src/upgini/__about__.py +1 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/binary.py +75 -72
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/date.py +22 -21
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/groupby.py +22 -22
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/operand.py +4 -4
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/unary.py +46 -47
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/vector.py +8 -8
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/features_enricher.py +2 -3
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/http.py +32 -32
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/lazy_import.py +1 -14
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/metadata.py +57 -57
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/normalizer/normalize_utils.py +2 -1
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/datetime_utils.py +5 -5
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/phone_utils.py +7 -5
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/target_utils.py +1 -4
- upgini-1.1.316a4/src/upgini/__about__.py +0 -1
- {upgini-1.1.316a4 → upgini-1.1.317}/.gitignore +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/LICENSE +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/README.md +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/ads.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/dataset.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/errors.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/metrics.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/search_task.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/spinner.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.316a4
|
|
3
|
+
Version: 1.1.317
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy
|
|
32
|
+
Requires-Dist: numpy>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic<
|
|
34
|
+
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -39,9 +39,9 @@ dependencies = [
|
|
|
39
39
|
"fastparquet>=0.8.1",
|
|
40
40
|
"ipywidgets>=8.1.0",
|
|
41
41
|
"lightgbm>=3.3.2",
|
|
42
|
-
"numpy>=1.19.0
|
|
42
|
+
"numpy>=1.19.0",
|
|
43
43
|
"pandas>=1.1.0,<3.0.0",
|
|
44
|
-
"pydantic
|
|
44
|
+
"pydantic>=1.8.2,<2.0.0",
|
|
45
45
|
"pyjwt>=2.8.0",
|
|
46
46
|
"python-dateutil>=2.8.0",
|
|
47
47
|
"python-json-logger>=2.0.2",
|
|
@@ -79,7 +79,7 @@ python = "3.10"
|
|
|
79
79
|
cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
|
|
80
80
|
format = "black {args}"
|
|
81
81
|
lint = "ruff check {args}"
|
|
82
|
-
|
|
82
|
+
test_binary = 'pytest -s -vv tests/test_binary_dataset.py'
|
|
83
83
|
|
|
84
84
|
[[tool.hatch.envs.test.matrix]]
|
|
85
85
|
python = ["3.8"]
|
|
@@ -103,7 +103,7 @@ dependencies = [
|
|
|
103
103
|
# "pytest-timeout",
|
|
104
104
|
"requests-mock",
|
|
105
105
|
"pytest-datafiles",
|
|
106
|
-
"pandas~={matrix:pandas}",
|
|
106
|
+
"pandas~={matrix:pandas}.0",
|
|
107
107
|
]
|
|
108
108
|
|
|
109
109
|
[tool.black]
|
|
@@ -115,5 +115,4 @@ profile = "black"
|
|
|
115
115
|
[tool.pytest.ini_options]
|
|
116
116
|
pythonpath = [
|
|
117
117
|
"./src"
|
|
118
|
-
]
|
|
119
|
-
addopts="-n 4"
|
|
118
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.317"
|
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name
|
|
13
|
-
is_binary
|
|
14
|
-
is_symmetrical
|
|
15
|
-
has_symmetry_importance
|
|
12
|
+
name = "min"
|
|
13
|
+
is_binary = True
|
|
14
|
+
is_symmetrical = True
|
|
15
|
+
has_symmetry_importance = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name
|
|
23
|
-
is_binary
|
|
24
|
-
is_symmetrical
|
|
25
|
-
has_symmetry_importance
|
|
22
|
+
name = "max"
|
|
23
|
+
is_binary = True
|
|
24
|
+
is_symmetrical = True
|
|
25
|
+
has_symmetry_importance = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name
|
|
33
|
-
alias
|
|
34
|
-
is_binary
|
|
35
|
-
is_symmetrical
|
|
36
|
-
has_symmetry_importance
|
|
37
|
-
is_vectorizable
|
|
32
|
+
name = "+"
|
|
33
|
+
alias = "add"
|
|
34
|
+
is_binary = True
|
|
35
|
+
is_symmetrical = True
|
|
36
|
+
has_symmetry_importance = True
|
|
37
|
+
is_vectorizable = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name
|
|
52
|
-
alias
|
|
53
|
-
is_binary
|
|
54
|
-
is_symmetrical
|
|
55
|
-
has_symmetry_importance
|
|
56
|
-
is_vectorizable
|
|
51
|
+
name = "-"
|
|
52
|
+
alias = "sub"
|
|
53
|
+
is_binary = True
|
|
54
|
+
is_symmetrical = True
|
|
55
|
+
has_symmetry_importance = True
|
|
56
|
+
is_vectorizable = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name
|
|
71
|
-
alias
|
|
72
|
-
is_binary
|
|
73
|
-
is_symmetrical
|
|
74
|
-
has_symmetry_importance
|
|
75
|
-
is_vectorizable
|
|
70
|
+
name = "*"
|
|
71
|
+
alias = "mul"
|
|
72
|
+
is_binary = True
|
|
73
|
+
is_symmetrical = True
|
|
74
|
+
has_symmetry_importance = True
|
|
75
|
+
is_vectorizable = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name
|
|
90
|
-
alias
|
|
91
|
-
is_binary
|
|
92
|
-
has_symmetry_importance
|
|
93
|
-
is_vectorizable
|
|
94
|
-
output_type
|
|
89
|
+
name = "/"
|
|
90
|
+
alias = "div"
|
|
91
|
+
is_binary = True
|
|
92
|
+
has_symmetry_importance = True
|
|
93
|
+
is_vectorizable = True
|
|
94
|
+
output_type = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name
|
|
109
|
-
is_binary
|
|
110
|
-
has_symmetry_importance
|
|
111
|
-
output_type
|
|
108
|
+
name = "Combine"
|
|
109
|
+
is_binary = True
|
|
110
|
+
has_symmetry_importance = True
|
|
111
|
+
output_type = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name
|
|
121
|
-
is_binary
|
|
122
|
-
is_symmetrical
|
|
123
|
-
has_symmetry_importance
|
|
124
|
-
output_type
|
|
125
|
-
is_distribution_dependent
|
|
126
|
-
input_type
|
|
120
|
+
name = "CombineThenFreq"
|
|
121
|
+
is_binary = True
|
|
122
|
+
is_symmetrical = True
|
|
123
|
+
has_symmetry_importance = True
|
|
124
|
+
output_type = "float"
|
|
125
|
+
is_distribution_dependent = True
|
|
126
|
+
input_type = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name
|
|
137
|
-
is_binary
|
|
138
|
-
output_type
|
|
139
|
-
is_symmetrical
|
|
140
|
-
has_symmetry_importance
|
|
136
|
+
name = "dist"
|
|
137
|
+
is_binary = True
|
|
138
|
+
output_type = "float"
|
|
139
|
+
is_symmetrical = True
|
|
140
|
+
has_symmetry_importance = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.
|
|
144
|
+
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
# row-wise dot product
|
|
@@ -152,14 +152,17 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
+
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
156
|
+
return np.sqrt(self.__dot(vector, vector))
|
|
157
|
+
|
|
155
158
|
|
|
156
159
|
# Left for backward compatibility
|
|
157
160
|
class Sim(Distance):
|
|
158
|
-
name
|
|
159
|
-
is_binary
|
|
160
|
-
output_type
|
|
161
|
-
is_symmetrical
|
|
162
|
-
has_symmetry_importance
|
|
161
|
+
name = "sim"
|
|
162
|
+
is_binary = True
|
|
163
|
+
output_type = "float"
|
|
164
|
+
is_symmetrical = True
|
|
165
|
+
has_symmetry_importance = True
|
|
163
166
|
|
|
164
167
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
165
168
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -188,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
188
191
|
|
|
189
192
|
|
|
190
193
|
class JaroWinklerSim1(StringSim):
|
|
191
|
-
name
|
|
192
|
-
is_binary
|
|
193
|
-
input_type
|
|
194
|
-
output_type
|
|
195
|
-
is_symmetrical
|
|
196
|
-
has_symmetry_importance
|
|
194
|
+
name = "sim_jw1"
|
|
195
|
+
is_binary = True
|
|
196
|
+
input_type = "string"
|
|
197
|
+
output_type = "float"
|
|
198
|
+
is_symmetrical = True
|
|
199
|
+
has_symmetry_importance = True
|
|
197
200
|
|
|
198
201
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
202
|
return value
|
|
@@ -203,12 +206,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
203
206
|
|
|
204
207
|
|
|
205
208
|
class JaroWinklerSim2(StringSim):
|
|
206
|
-
name
|
|
207
|
-
is_binary
|
|
208
|
-
input_type
|
|
209
|
-
output_type
|
|
210
|
-
is_symmetrical
|
|
211
|
-
has_symmetry_importance
|
|
209
|
+
name = "sim_jw2"
|
|
210
|
+
is_binary = True
|
|
211
|
+
input_type = "string"
|
|
212
|
+
output_type = "float"
|
|
213
|
+
is_symmetrical = True
|
|
214
|
+
has_symmetry_importance = True
|
|
212
215
|
|
|
213
216
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
217
|
return value[::-1] if value is not None else None
|
|
@@ -218,12 +221,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
218
221
|
|
|
219
222
|
|
|
220
223
|
class LevenshteinSim(StringSim):
|
|
221
|
-
name
|
|
222
|
-
is_binary
|
|
223
|
-
input_type
|
|
224
|
-
output_type
|
|
225
|
-
is_symmetrical
|
|
226
|
-
has_symmetry_importance
|
|
224
|
+
name = "sim_lv"
|
|
225
|
+
is_binary = True
|
|
226
|
+
input_type = "string"
|
|
227
|
+
output_type = "float"
|
|
228
|
+
is_symmetrical = True
|
|
229
|
+
has_symmetry_importance = True
|
|
227
230
|
|
|
228
231
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
232
|
return value
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import abc
|
|
2
|
+
import json
|
|
2
3
|
from typing import Any, Dict, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
@@ -38,10 +39,10 @@ class DateDiffMixin(BaseModel):
|
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
41
|
-
name
|
|
42
|
-
alias
|
|
43
|
-
is_binary
|
|
44
|
-
has_symmetry_importance
|
|
42
|
+
name = "date_diff"
|
|
43
|
+
alias = "date_diff_type1"
|
|
44
|
+
is_binary = True
|
|
45
|
+
has_symmetry_importance = True
|
|
45
46
|
|
|
46
47
|
replace_negative: bool = False
|
|
47
48
|
|
|
@@ -70,9 +71,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
70
71
|
|
|
71
72
|
|
|
72
73
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
73
|
-
name
|
|
74
|
-
is_binary
|
|
75
|
-
has_symmetry_importance
|
|
74
|
+
name = "date_diff_type2"
|
|
75
|
+
is_binary = True
|
|
76
|
+
has_symmetry_importance = True
|
|
76
77
|
|
|
77
78
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
78
79
|
res = super().get_params()
|
|
@@ -104,8 +105,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
104
105
|
|
|
105
106
|
|
|
106
107
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
107
|
-
is_binary
|
|
108
|
-
has_symmetry_importance
|
|
108
|
+
is_binary = True
|
|
109
|
+
has_symmetry_importance = True
|
|
109
110
|
|
|
110
111
|
aggregation: str
|
|
111
112
|
replace_negative: bool = False
|
|
@@ -165,8 +166,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
165
166
|
|
|
166
167
|
|
|
167
168
|
class DateListDiffBounded(DateListDiff):
|
|
168
|
-
lower_bound: Optional[int]
|
|
169
|
-
upper_bound: Optional[int]
|
|
169
|
+
lower_bound: Optional[int]
|
|
170
|
+
upper_bound: Optional[int]
|
|
170
171
|
|
|
171
172
|
def __init__(self, **data: Any) -> None:
|
|
172
173
|
if "name" not in data:
|
|
@@ -191,8 +192,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
191
192
|
|
|
192
193
|
|
|
193
194
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
194
|
-
is_binary
|
|
195
|
-
output_type
|
|
195
|
+
is_binary = True
|
|
196
|
+
output_type = "float"
|
|
196
197
|
|
|
197
198
|
date_unit: Optional[str] = None
|
|
198
199
|
|
|
@@ -226,12 +227,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
226
227
|
|
|
227
228
|
|
|
228
229
|
class DatePercentile(DatePercentileBase):
|
|
229
|
-
name
|
|
230
|
-
alias
|
|
230
|
+
name = "date_per"
|
|
231
|
+
alias = "date_per_method1"
|
|
231
232
|
|
|
232
|
-
zero_month: Optional[int]
|
|
233
|
-
zero_year: Optional[int]
|
|
234
|
-
zero_bounds: Optional[List[float]]
|
|
233
|
+
zero_month: Optional[int]
|
|
234
|
+
zero_year: Optional[int]
|
|
235
|
+
zero_bounds: Optional[List[float]]
|
|
235
236
|
step: int = 30
|
|
236
237
|
|
|
237
238
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -246,12 +247,12 @@ class DatePercentile(DatePercentileBase):
|
|
|
246
247
|
)
|
|
247
248
|
return res
|
|
248
249
|
|
|
249
|
-
@validator("zero_bounds", pre=
|
|
250
|
+
@validator("zero_bounds", pre=True)
|
|
250
251
|
def validate_bounds(cls, value):
|
|
251
252
|
if value is None or isinstance(value, list):
|
|
252
253
|
return value
|
|
253
254
|
elif isinstance(value, str):
|
|
254
|
-
return
|
|
255
|
+
return json.loads(value)
|
|
255
256
|
|
|
256
257
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
257
258
|
months = date_col.dt.month
|
|
@@ -264,7 +265,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
264
265
|
|
|
265
266
|
|
|
266
267
|
class DatePercentileMethod2(DatePercentileBase):
|
|
267
|
-
name
|
|
268
|
+
name = "date_per_method2"
|
|
268
269
|
|
|
269
270
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
270
271
|
pass
|
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable
|
|
11
|
-
is_grouping
|
|
12
|
-
is_distribution_dependent
|
|
10
|
+
is_vectorizable = True
|
|
11
|
+
is_grouping = True
|
|
12
|
+
is_distribution_dependent = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name
|
|
28
|
-
pandas_agg
|
|
29
|
-
is_distribution_dependent
|
|
27
|
+
name = "GroupByThenMedian"
|
|
28
|
+
pandas_agg = "median"
|
|
29
|
+
is_distribution_dependent = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name
|
|
34
|
-
is_vectorizable
|
|
35
|
-
is_grouping
|
|
36
|
-
output_type
|
|
37
|
-
is_distribution_dependent
|
|
33
|
+
name = "GroupByThenRank"
|
|
34
|
+
is_vectorizable = True
|
|
35
|
+
is_grouping = True
|
|
36
|
+
output_type = "float"
|
|
37
|
+
is_distribution_dependent = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name
|
|
53
|
-
is_vectorizable
|
|
54
|
-
is_grouping
|
|
55
|
-
output_type
|
|
56
|
-
is_distribution_dependent
|
|
57
|
-
input_type
|
|
52
|
+
name = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable = True
|
|
54
|
+
is_grouping = True
|
|
55
|
+
output_type = "int"
|
|
56
|
+
is_distribution_dependent = True
|
|
57
|
+
input_type = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name
|
|
73
|
-
is_grouping
|
|
74
|
-
output_type
|
|
75
|
-
is_distribution_dependent
|
|
76
|
-
input_type
|
|
72
|
+
name = "GroupByThenFreq"
|
|
73
|
+
is_grouping = True
|
|
74
|
+
output_type = "float"
|
|
75
|
+
is_distribution_dependent = True
|
|
76
|
+
input_type = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str]
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str]
|
|
16
|
+
output_type: Optional[str]
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]]
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
from typing import Optional
|
|
2
1
|
import numpy as np
|
|
3
2
|
import pandas as pd
|
|
4
3
|
from sklearn.preprocessing import Normalizer
|
|
@@ -7,10 +6,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
class Abs(PandasOperand, VectorizableMixin):
|
|
10
|
-
name
|
|
11
|
-
is_unary
|
|
12
|
-
is_vectorizable
|
|
13
|
-
group_index
|
|
9
|
+
name = "abs"
|
|
10
|
+
is_unary = True
|
|
11
|
+
is_vectorizable = True
|
|
12
|
+
group_index = 0
|
|
14
13
|
|
|
15
14
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
16
15
|
return data.abs()
|
|
@@ -20,11 +19,11 @@ class Abs(PandasOperand, VectorizableMixin):
|
|
|
20
19
|
|
|
21
20
|
|
|
22
21
|
class Log(PandasOperand, VectorizableMixin):
|
|
23
|
-
name
|
|
24
|
-
is_unary
|
|
25
|
-
is_vectorizable
|
|
26
|
-
output_type
|
|
27
|
-
group_index
|
|
22
|
+
name = "log"
|
|
23
|
+
is_unary = True
|
|
24
|
+
is_vectorizable = True
|
|
25
|
+
output_type = "float"
|
|
26
|
+
group_index = 0
|
|
28
27
|
|
|
29
28
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
30
29
|
return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
|
|
@@ -34,11 +33,11 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
34
33
|
|
|
35
34
|
|
|
36
35
|
class Sqrt(PandasOperand, VectorizableMixin):
|
|
37
|
-
name
|
|
38
|
-
is_unary
|
|
39
|
-
is_vectorizable
|
|
40
|
-
output_type
|
|
41
|
-
group_index
|
|
36
|
+
name = "sqrt"
|
|
37
|
+
is_unary = True
|
|
38
|
+
is_vectorizable = True
|
|
39
|
+
output_type = "float"
|
|
40
|
+
group_index = 0
|
|
42
41
|
|
|
43
42
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
44
43
|
return self._round_value(np.sqrt(np.abs(data)))
|
|
@@ -48,10 +47,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
48
47
|
|
|
49
48
|
|
|
50
49
|
class Square(PandasOperand, VectorizableMixin):
|
|
51
|
-
name
|
|
52
|
-
is_unary
|
|
53
|
-
is_vectorizable
|
|
54
|
-
group_index
|
|
50
|
+
name = "square"
|
|
51
|
+
is_unary = True
|
|
52
|
+
is_vectorizable = True
|
|
53
|
+
group_index = 0
|
|
55
54
|
|
|
56
55
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
57
56
|
return np.square(data)
|
|
@@ -61,11 +60,11 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
61
60
|
|
|
62
61
|
|
|
63
62
|
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
64
|
-
name
|
|
65
|
-
is_unary
|
|
66
|
-
is_vectorizable
|
|
67
|
-
output_type
|
|
68
|
-
group_index
|
|
63
|
+
name = "sigmoid"
|
|
64
|
+
is_unary = True
|
|
65
|
+
is_vectorizable = True
|
|
66
|
+
output_type = "float"
|
|
67
|
+
group_index = 0
|
|
69
68
|
|
|
70
69
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
71
70
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
@@ -75,12 +74,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
75
74
|
|
|
76
75
|
|
|
77
76
|
class Floor(PandasOperand, VectorizableMixin):
|
|
78
|
-
name
|
|
79
|
-
is_unary
|
|
80
|
-
is_vectorizable
|
|
81
|
-
output_type
|
|
82
|
-
input_type
|
|
83
|
-
group_index
|
|
77
|
+
name = "floor"
|
|
78
|
+
is_unary = True
|
|
79
|
+
is_vectorizable = True
|
|
80
|
+
output_type = "int"
|
|
81
|
+
input_type = "continuous"
|
|
82
|
+
group_index = 0
|
|
84
83
|
|
|
85
84
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
86
85
|
return np.floor(data)
|
|
@@ -90,11 +89,11 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
90
89
|
|
|
91
90
|
|
|
92
91
|
class Residual(PandasOperand, VectorizableMixin):
|
|
93
|
-
name
|
|
94
|
-
is_unary
|
|
95
|
-
is_vectorizable
|
|
96
|
-
input_type
|
|
97
|
-
group_index
|
|
92
|
+
name = "residual"
|
|
93
|
+
is_unary = True
|
|
94
|
+
is_vectorizable = True
|
|
95
|
+
input_type = "continuous"
|
|
96
|
+
group_index = 0
|
|
98
97
|
|
|
99
98
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
100
99
|
return data - np.floor(data)
|
|
@@ -104,11 +103,11 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
104
103
|
|
|
105
104
|
|
|
106
105
|
class Freq(PandasOperand):
|
|
107
|
-
name
|
|
108
|
-
is_unary
|
|
109
|
-
output_type
|
|
110
|
-
is_distribution_dependent
|
|
111
|
-
input_type
|
|
106
|
+
name = "freq"
|
|
107
|
+
is_unary = True
|
|
108
|
+
output_type = "float"
|
|
109
|
+
is_distribution_dependent = True
|
|
110
|
+
input_type = "discrete"
|
|
112
111
|
|
|
113
112
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
114
113
|
value_counts = data.value_counts(normalize=True)
|
|
@@ -116,9 +115,9 @@ class Freq(PandasOperand):
|
|
|
116
115
|
|
|
117
116
|
|
|
118
117
|
class Norm(PandasOperand):
|
|
119
|
-
name
|
|
120
|
-
is_unary
|
|
121
|
-
output_type
|
|
118
|
+
name = "norm"
|
|
119
|
+
is_unary = True
|
|
120
|
+
output_type = "float"
|
|
122
121
|
|
|
123
122
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
124
123
|
data_dropna = data.dropna()
|
|
@@ -132,7 +131,7 @@ class Norm(PandasOperand):
|
|
|
132
131
|
|
|
133
132
|
|
|
134
133
|
class Embeddings(PandasOperand):
|
|
135
|
-
name
|
|
136
|
-
is_unary
|
|
137
|
-
input_type
|
|
138
|
-
output_type
|
|
134
|
+
name = "emb"
|
|
135
|
+
is_unary = True
|
|
136
|
+
input_type = "string"
|
|
137
|
+
output_type = "vector"
|