upgini 1.1.316__tar.gz → 1.1.316a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.316 → upgini-1.1.316a2}/PKG-INFO +3 -3
- {upgini-1.1.316 → upgini-1.1.316a2}/pyproject.toml +6 -5
- upgini-1.1.316a2/src/upgini/__about__.py +1 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/binary.py +72 -75
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/date.py +21 -21
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/feature.py +2 -2
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/groupby.py +22 -22
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/operand.py +4 -4
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/unary.py +47 -46
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/vector.py +8 -8
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/features_enricher.py +3 -2
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/http.py +32 -32
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/lazy_import.py +14 -1
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/metadata.py +57 -57
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/normalizer/normalize_utils.py +1 -2
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/datetime_utils.py +5 -5
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/phone_utils.py +5 -7
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/target_utils.py +4 -1
- upgini-1.1.316/src/upgini/__about__.py +0 -1
- {upgini-1.1.316 → upgini-1.1.316a2}/.gitignore +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/LICENSE +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/README.md +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/ads.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/dataset.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/errors.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/metrics.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/search_task.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/spinner.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.316 → upgini-1.1.316a2}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.316
|
|
3
|
+
Version: 1.1.316a2
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy>=1.19.0
|
|
32
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic<
|
|
34
|
+
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -39,9 +39,9 @@ dependencies = [
|
|
|
39
39
|
"fastparquet>=0.8.1",
|
|
40
40
|
"ipywidgets>=8.1.0",
|
|
41
41
|
"lightgbm>=3.3.2",
|
|
42
|
-
"numpy>=1.19.0",
|
|
42
|
+
"numpy>=1.19.0,<=1.26.4",
|
|
43
43
|
"pandas>=1.1.0,<3.0.0",
|
|
44
|
-
"pydantic
|
|
44
|
+
"pydantic>1.0.0,<3.0.0",
|
|
45
45
|
"pyjwt>=2.8.0",
|
|
46
46
|
"python-dateutil>=2.8.0",
|
|
47
47
|
"python-json-logger>=2.0.2",
|
|
@@ -79,7 +79,7 @@ python = "3.10"
|
|
|
79
79
|
cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
|
|
80
80
|
format = "black {args}"
|
|
81
81
|
lint = "ruff check {args}"
|
|
82
|
-
|
|
82
|
+
test_all = 'pytest -s -vv tests'
|
|
83
83
|
|
|
84
84
|
[[tool.hatch.envs.test.matrix]]
|
|
85
85
|
python = ["3.8"]
|
|
@@ -103,7 +103,7 @@ dependencies = [
|
|
|
103
103
|
# "pytest-timeout",
|
|
104
104
|
"requests-mock",
|
|
105
105
|
"pytest-datafiles",
|
|
106
|
-
"pandas~={matrix:pandas}
|
|
106
|
+
"pandas~={matrix:pandas}",
|
|
107
107
|
]
|
|
108
108
|
|
|
109
109
|
[tool.black]
|
|
@@ -115,4 +115,5 @@ profile = "black"
|
|
|
115
115
|
[tool.pytest.ini_options]
|
|
116
116
|
pythonpath = [
|
|
117
117
|
"./src"
|
|
118
|
-
]
|
|
118
|
+
]
|
|
119
|
+
addopts="-n 4"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.316a2"
|
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name = "min"
|
|
13
|
-
is_binary = True
|
|
14
|
-
is_symmetrical = True
|
|
15
|
-
has_symmetry_importance = True
|
|
12
|
+
name: str = "min"
|
|
13
|
+
is_binary: bool = True
|
|
14
|
+
is_symmetrical: bool = True
|
|
15
|
+
has_symmetry_importance: bool = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name = "max"
|
|
23
|
-
is_binary = True
|
|
24
|
-
is_symmetrical = True
|
|
25
|
-
has_symmetry_importance = True
|
|
22
|
+
name: str = "max"
|
|
23
|
+
is_binary: bool = True
|
|
24
|
+
is_symmetrical: bool = True
|
|
25
|
+
has_symmetry_importance: bool = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name = "+"
|
|
33
|
-
alias = "add"
|
|
34
|
-
is_binary = True
|
|
35
|
-
is_symmetrical = True
|
|
36
|
-
has_symmetry_importance = True
|
|
37
|
-
is_vectorizable = True
|
|
32
|
+
name: str = "+"
|
|
33
|
+
alias: str = "add"
|
|
34
|
+
is_binary: bool = True
|
|
35
|
+
is_symmetrical: bool = True
|
|
36
|
+
has_symmetry_importance: bool = True
|
|
37
|
+
is_vectorizable: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name = "-"
|
|
52
|
-
alias = "sub"
|
|
53
|
-
is_binary = True
|
|
54
|
-
is_symmetrical = True
|
|
55
|
-
has_symmetry_importance = True
|
|
56
|
-
is_vectorizable = True
|
|
51
|
+
name: str = "-"
|
|
52
|
+
alias: str = "sub"
|
|
53
|
+
is_binary: bool = True
|
|
54
|
+
is_symmetrical: bool = True
|
|
55
|
+
has_symmetry_importance: bool = True
|
|
56
|
+
is_vectorizable: bool = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name = "*"
|
|
71
|
-
alias = "mul"
|
|
72
|
-
is_binary = True
|
|
73
|
-
is_symmetrical = True
|
|
74
|
-
has_symmetry_importance = True
|
|
75
|
-
is_vectorizable = True
|
|
70
|
+
name: str = "*"
|
|
71
|
+
alias: str = "mul"
|
|
72
|
+
is_binary: bool = True
|
|
73
|
+
is_symmetrical: bool = True
|
|
74
|
+
has_symmetry_importance: bool = True
|
|
75
|
+
is_vectorizable: bool = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name = "/"
|
|
90
|
-
alias = "div"
|
|
91
|
-
is_binary = True
|
|
92
|
-
has_symmetry_importance = True
|
|
93
|
-
is_vectorizable = True
|
|
94
|
-
output_type = "float"
|
|
89
|
+
name: str = "/"
|
|
90
|
+
alias: str = "div"
|
|
91
|
+
is_binary: bool = True
|
|
92
|
+
has_symmetry_importance: bool = True
|
|
93
|
+
is_vectorizable: bool = True
|
|
94
|
+
output_type: Optional[str] = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name = "Combine"
|
|
109
|
-
is_binary = True
|
|
110
|
-
has_symmetry_importance = True
|
|
111
|
-
output_type = "object"
|
|
108
|
+
name: str = "Combine"
|
|
109
|
+
is_binary: bool = True
|
|
110
|
+
has_symmetry_importance: bool = True
|
|
111
|
+
output_type: Optional[str] = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name = "CombineThenFreq"
|
|
121
|
-
is_binary = True
|
|
122
|
-
is_symmetrical = True
|
|
123
|
-
has_symmetry_importance = True
|
|
124
|
-
output_type = "float"
|
|
125
|
-
is_distribution_dependent = True
|
|
126
|
-
input_type = "discrete"
|
|
120
|
+
name: str = "CombineThenFreq"
|
|
121
|
+
is_binary: bool = True
|
|
122
|
+
is_symmetrical: bool = True
|
|
123
|
+
has_symmetry_importance: bool = True
|
|
124
|
+
output_type: Optional[str] = "float"
|
|
125
|
+
is_distribution_dependent: bool = True
|
|
126
|
+
input_type: Optional[str] = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name = "dist"
|
|
137
|
-
is_binary = True
|
|
138
|
-
output_type = "float"
|
|
139
|
-
is_symmetrical = True
|
|
140
|
-
has_symmetry_importance = True
|
|
136
|
+
name: str = "dist"
|
|
137
|
+
is_binary: bool = True
|
|
138
|
+
output_type: Optional[str] = "float"
|
|
139
|
+
is_symmetrical: bool = True
|
|
140
|
+
has_symmetry_importance: bool = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
144
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
# row-wise dot product
|
|
@@ -152,17 +152,14 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
-
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
156
|
-
return np.sqrt(self.__dot(vector, vector))
|
|
157
|
-
|
|
158
155
|
|
|
159
156
|
# Left for backward compatibility
|
|
160
157
|
class Sim(Distance):
|
|
161
|
-
name = "sim"
|
|
162
|
-
is_binary = True
|
|
163
|
-
output_type = "float"
|
|
164
|
-
is_symmetrical = True
|
|
165
|
-
has_symmetry_importance = True
|
|
158
|
+
name: str = "sim"
|
|
159
|
+
is_binary: bool = True
|
|
160
|
+
output_type: Optional[str] = "float"
|
|
161
|
+
is_symmetrical: bool = True
|
|
162
|
+
has_symmetry_importance: bool = True
|
|
166
163
|
|
|
167
164
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
165
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -191,12 +188,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
191
188
|
|
|
192
189
|
|
|
193
190
|
class JaroWinklerSim1(StringSim):
|
|
194
|
-
name = "sim_jw1"
|
|
195
|
-
is_binary = True
|
|
196
|
-
input_type = "string"
|
|
197
|
-
output_type = "float"
|
|
198
|
-
is_symmetrical = True
|
|
199
|
-
has_symmetry_importance = True
|
|
191
|
+
name: str = "sim_jw1"
|
|
192
|
+
is_binary: bool = True
|
|
193
|
+
input_type: Optional[str] = "string"
|
|
194
|
+
output_type: Optional[str] = "float"
|
|
195
|
+
is_symmetrical: bool = True
|
|
196
|
+
has_symmetry_importance: bool = True
|
|
200
197
|
|
|
201
198
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
202
199
|
return value
|
|
@@ -206,12 +203,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
206
203
|
|
|
207
204
|
|
|
208
205
|
class JaroWinklerSim2(StringSim):
|
|
209
|
-
name = "sim_jw2"
|
|
210
|
-
is_binary = True
|
|
211
|
-
input_type = "string"
|
|
212
|
-
output_type = "float"
|
|
213
|
-
is_symmetrical = True
|
|
214
|
-
has_symmetry_importance = True
|
|
206
|
+
name: str = "sim_jw2"
|
|
207
|
+
is_binary: bool = True
|
|
208
|
+
input_type: Optional[str] = "string"
|
|
209
|
+
output_type: Optional[str] = "float"
|
|
210
|
+
is_symmetrical: bool = True
|
|
211
|
+
has_symmetry_importance: bool = True
|
|
215
212
|
|
|
216
213
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
217
214
|
return value[::-1] if value is not None else None
|
|
@@ -221,12 +218,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
221
218
|
|
|
222
219
|
|
|
223
220
|
class LevenshteinSim(StringSim):
|
|
224
|
-
name = "sim_lv"
|
|
225
|
-
is_binary = True
|
|
226
|
-
input_type = "string"
|
|
227
|
-
output_type = "float"
|
|
228
|
-
is_symmetrical = True
|
|
229
|
-
has_symmetry_importance = True
|
|
221
|
+
name: str = "sim_lv"
|
|
222
|
+
is_binary: bool = True
|
|
223
|
+
input_type: Optional[str] = "string"
|
|
224
|
+
output_type: Optional[str] = "float"
|
|
225
|
+
is_symmetrical: bool = True
|
|
226
|
+
has_symmetry_importance: bool = True
|
|
230
227
|
|
|
231
228
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
232
229
|
return value
|
|
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
7
|
-
from pydantic import BaseModel, validator
|
|
7
|
+
from pydantic import BaseModel, field_validator
|
|
8
8
|
|
|
9
9
|
from upgini.autofe.operand import PandasOperand
|
|
10
10
|
|
|
@@ -38,10 +38,10 @@ class DateDiffMixin(BaseModel):
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
41
|
-
name = "date_diff"
|
|
42
|
-
alias = "date_diff_type1"
|
|
43
|
-
is_binary = True
|
|
44
|
-
has_symmetry_importance = True
|
|
41
|
+
name: str = "date_diff"
|
|
42
|
+
alias: Optional[str] = "date_diff_type1"
|
|
43
|
+
is_binary: bool = True
|
|
44
|
+
has_symmetry_importance: bool = True
|
|
45
45
|
|
|
46
46
|
replace_negative: bool = False
|
|
47
47
|
|
|
@@ -70,9 +70,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
73
|
-
name = "date_diff_type2"
|
|
74
|
-
is_binary = True
|
|
75
|
-
has_symmetry_importance = True
|
|
73
|
+
name: str = "date_diff_type2"
|
|
74
|
+
is_binary: bool = True
|
|
75
|
+
has_symmetry_importance: bool = True
|
|
76
76
|
|
|
77
77
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
78
78
|
res = super().get_params()
|
|
@@ -104,8 +104,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
107
|
-
is_binary = True
|
|
108
|
-
has_symmetry_importance = True
|
|
107
|
+
is_binary: bool = True
|
|
108
|
+
has_symmetry_importance: bool = True
|
|
109
109
|
|
|
110
110
|
aggregation: str
|
|
111
111
|
replace_negative: bool = False
|
|
@@ -165,8 +165,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
165
165
|
|
|
166
166
|
|
|
167
167
|
class DateListDiffBounded(DateListDiff):
|
|
168
|
-
lower_bound: Optional[int]
|
|
169
|
-
upper_bound: Optional[int]
|
|
168
|
+
lower_bound: Optional[int] = None
|
|
169
|
+
upper_bound: Optional[int] = None
|
|
170
170
|
|
|
171
171
|
def __init__(self, **data: Any) -> None:
|
|
172
172
|
if "name" not in data:
|
|
@@ -191,8 +191,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
194
|
-
is_binary = True
|
|
195
|
-
output_type = "float"
|
|
194
|
+
is_binary: bool = True
|
|
195
|
+
output_type: Optional[str] = "float"
|
|
196
196
|
|
|
197
197
|
date_unit: Optional[str] = None
|
|
198
198
|
|
|
@@ -226,12 +226,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
226
226
|
|
|
227
227
|
|
|
228
228
|
class DatePercentile(DatePercentileBase):
|
|
229
|
-
name = "date_per"
|
|
230
|
-
alias = "date_per_method1"
|
|
229
|
+
name: str = "date_per"
|
|
230
|
+
alias: Optional[str] = "date_per_method1"
|
|
231
231
|
|
|
232
|
-
zero_month: Optional[int]
|
|
233
|
-
zero_year: Optional[int]
|
|
234
|
-
zero_bounds: Optional[List[float]]
|
|
232
|
+
zero_month: Optional[int] = None
|
|
233
|
+
zero_year: Optional[int] = None
|
|
234
|
+
zero_bounds: Optional[List[float]] = None
|
|
235
235
|
step: int = 30
|
|
236
236
|
|
|
237
237
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -246,7 +246,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
246
246
|
)
|
|
247
247
|
return res
|
|
248
248
|
|
|
249
|
-
@validator("zero_bounds", pre=True)
|
|
249
|
+
@field_validator("zero_bounds", mode="before")
|
|
250
250
|
def validate_bounds(cls, value):
|
|
251
251
|
if value is None or isinstance(value, list):
|
|
252
252
|
return value
|
|
@@ -264,7 +264,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
264
264
|
|
|
265
265
|
|
|
266
266
|
class DatePercentileMethod2(DatePercentileBase):
|
|
267
|
-
name = "date_per_method2"
|
|
267
|
+
name: str = "date_per_method2"
|
|
268
268
|
|
|
269
269
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
270
270
|
pass
|
|
@@ -82,9 +82,9 @@ class Feature:
|
|
|
82
82
|
self.alias = alias
|
|
83
83
|
|
|
84
84
|
def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
|
|
85
|
-
obj_dict = self.op.dict().copy()
|
|
85
|
+
obj_dict = self.op.model_dump().copy()
|
|
86
86
|
obj_dict.update(params or {})
|
|
87
|
-
self.op = self.op.__class__.parse_obj(obj_dict)
|
|
87
|
+
self.op = self.op.__class__.model_validate(obj_dict)
|
|
88
88
|
self.op.set_params(params)
|
|
89
89
|
|
|
90
90
|
for child in self.children:
|
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
is_grouping = True
|
|
12
|
-
is_distribution_dependent = True
|
|
10
|
+
is_vectorizable: bool = True
|
|
11
|
+
is_grouping: bool = True
|
|
12
|
+
is_distribution_dependent: bool = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name = "GroupByThenMedian"
|
|
28
|
-
pandas_agg = "median"
|
|
29
|
-
is_distribution_dependent = True
|
|
27
|
+
name: str = "GroupByThenMedian"
|
|
28
|
+
pandas_agg: str = "median"
|
|
29
|
+
is_distribution_dependent: bool = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name = "GroupByThenRank"
|
|
34
|
-
is_vectorizable = True
|
|
35
|
-
is_grouping = True
|
|
36
|
-
output_type = "float"
|
|
37
|
-
is_distribution_dependent = True
|
|
33
|
+
name: str = "GroupByThenRank"
|
|
34
|
+
is_vectorizable: bool = True
|
|
35
|
+
is_grouping: bool = True
|
|
36
|
+
output_type: Optional[str] = "float"
|
|
37
|
+
is_distribution_dependent: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name = "GroupByThenNUnique"
|
|
53
|
-
is_vectorizable = True
|
|
54
|
-
is_grouping = True
|
|
55
|
-
output_type = "int"
|
|
56
|
-
is_distribution_dependent = True
|
|
57
|
-
input_type = "discrete"
|
|
52
|
+
name: str = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
is_grouping: bool = True
|
|
55
|
+
output_type: Optional[str] = "int"
|
|
56
|
+
is_distribution_dependent: bool = True
|
|
57
|
+
input_type: Optional[str] = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name = "GroupByThenFreq"
|
|
73
|
-
is_grouping = True
|
|
74
|
-
output_type = "float"
|
|
75
|
-
is_distribution_dependent = True
|
|
76
|
-
input_type = "discrete"
|
|
72
|
+
name: str = "GroupByThenFreq"
|
|
73
|
+
is_grouping: bool = True
|
|
74
|
+
output_type: Optional[str] = "float"
|
|
75
|
+
is_distribution_dependent: bool = True
|
|
76
|
+
input_type: Optional[str] = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str] = None
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str] = None
|
|
16
|
+
output_type: Optional[str] = None
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]] = None
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from typing import Optional
|
|
1
2
|
import numpy as np
|
|
2
3
|
import pandas as pd
|
|
3
4
|
from sklearn.preprocessing import Normalizer
|
|
@@ -6,10 +7,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Abs(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "abs"
|
|
10
|
-
is_unary = True
|
|
11
|
-
is_vectorizable = True
|
|
12
|
-
group_index = 0
|
|
10
|
+
name: str = "abs"
|
|
11
|
+
is_unary: bool = True
|
|
12
|
+
is_vectorizable: bool = True
|
|
13
|
+
group_index: int = 0
|
|
13
14
|
|
|
14
15
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
15
16
|
return data.abs()
|
|
@@ -19,11 +20,11 @@ class Abs(PandasOperand, VectorizableMixin):
|
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class Log(PandasOperand, VectorizableMixin):
|
|
22
|
-
name = "log"
|
|
23
|
-
is_unary = True
|
|
24
|
-
is_vectorizable = True
|
|
25
|
-
output_type = "float"
|
|
26
|
-
group_index = 0
|
|
23
|
+
name: str = "log"
|
|
24
|
+
is_unary: bool = True
|
|
25
|
+
is_vectorizable: bool = True
|
|
26
|
+
output_type: Optional[str] = "float"
|
|
27
|
+
group_index: int = 0
|
|
27
28
|
|
|
28
29
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
29
30
|
return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
|
|
@@ -33,11 +34,11 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class Sqrt(PandasOperand, VectorizableMixin):
|
|
36
|
-
name = "sqrt"
|
|
37
|
-
is_unary = True
|
|
38
|
-
is_vectorizable = True
|
|
39
|
-
output_type = "float"
|
|
40
|
-
group_index = 0
|
|
37
|
+
name: str = "sqrt"
|
|
38
|
+
is_unary: bool = True
|
|
39
|
+
is_vectorizable: bool = True
|
|
40
|
+
output_type: Optional[str] = "float"
|
|
41
|
+
group_index: int = 0
|
|
41
42
|
|
|
42
43
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
43
44
|
return self._round_value(np.sqrt(np.abs(data)))
|
|
@@ -47,10 +48,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
class Square(PandasOperand, VectorizableMixin):
|
|
50
|
-
name = "square"
|
|
51
|
-
is_unary = True
|
|
52
|
-
is_vectorizable = True
|
|
53
|
-
group_index = 0
|
|
51
|
+
name: str = "square"
|
|
52
|
+
is_unary: bool = True
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
group_index: int = 0
|
|
54
55
|
|
|
55
56
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
56
57
|
return np.square(data)
|
|
@@ -60,11 +61,11 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
63
|
-
name = "sigmoid"
|
|
64
|
-
is_unary = True
|
|
65
|
-
is_vectorizable = True
|
|
66
|
-
output_type = "float"
|
|
67
|
-
group_index = 0
|
|
64
|
+
name: str = "sigmoid"
|
|
65
|
+
is_unary: bool = True
|
|
66
|
+
is_vectorizable: bool = True
|
|
67
|
+
output_type: Optional[str] = "float"
|
|
68
|
+
group_index: int = 0
|
|
68
69
|
|
|
69
70
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
70
71
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
@@ -74,12 +75,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
74
75
|
|
|
75
76
|
|
|
76
77
|
class Floor(PandasOperand, VectorizableMixin):
|
|
77
|
-
name = "floor"
|
|
78
|
-
is_unary = True
|
|
79
|
-
is_vectorizable = True
|
|
80
|
-
output_type = "int"
|
|
81
|
-
input_type = "continuous"
|
|
82
|
-
group_index = 0
|
|
78
|
+
name: str = "floor"
|
|
79
|
+
is_unary: bool = True
|
|
80
|
+
is_vectorizable: bool = True
|
|
81
|
+
output_type: Optional[str] = "int"
|
|
82
|
+
input_type: Optional[str] = "continuous"
|
|
83
|
+
group_index: int = 0
|
|
83
84
|
|
|
84
85
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
85
86
|
return np.floor(data)
|
|
@@ -89,11 +90,11 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
89
90
|
|
|
90
91
|
|
|
91
92
|
class Residual(PandasOperand, VectorizableMixin):
|
|
92
|
-
name = "residual"
|
|
93
|
-
is_unary = True
|
|
94
|
-
is_vectorizable = True
|
|
95
|
-
input_type = "continuous"
|
|
96
|
-
group_index = 0
|
|
93
|
+
name: str = "residual"
|
|
94
|
+
is_unary: bool = True
|
|
95
|
+
is_vectorizable: bool = True
|
|
96
|
+
input_type: Optional[str] = "continuous"
|
|
97
|
+
group_index: int = 0
|
|
97
98
|
|
|
98
99
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
99
100
|
return data - np.floor(data)
|
|
@@ -103,11 +104,11 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
103
104
|
|
|
104
105
|
|
|
105
106
|
class Freq(PandasOperand):
|
|
106
|
-
name = "freq"
|
|
107
|
-
is_unary = True
|
|
108
|
-
output_type = "float"
|
|
109
|
-
is_distribution_dependent = True
|
|
110
|
-
input_type = "discrete"
|
|
107
|
+
name: str = "freq"
|
|
108
|
+
is_unary: bool = True
|
|
109
|
+
output_type: Optional[str] = "float"
|
|
110
|
+
is_distribution_dependent: bool = True
|
|
111
|
+
input_type: Optional[str] = "discrete"
|
|
111
112
|
|
|
112
113
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
113
114
|
value_counts = data.value_counts(normalize=True)
|
|
@@ -115,9 +116,9 @@ class Freq(PandasOperand):
|
|
|
115
116
|
|
|
116
117
|
|
|
117
118
|
class Norm(PandasOperand):
|
|
118
|
-
name = "norm"
|
|
119
|
-
is_unary = True
|
|
120
|
-
output_type = "float"
|
|
119
|
+
name: str = "norm"
|
|
120
|
+
is_unary: bool = True
|
|
121
|
+
output_type: Optional[str] = "float"
|
|
121
122
|
|
|
122
123
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
123
124
|
data_dropna = data.dropna()
|
|
@@ -131,7 +132,7 @@ class Norm(PandasOperand):
|
|
|
131
132
|
|
|
132
133
|
|
|
133
134
|
class Embeddings(PandasOperand):
|
|
134
|
-
name = "emb"
|
|
135
|
-
is_unary = True
|
|
136
|
-
input_type = "string"
|
|
137
|
-
output_type = "vector"
|
|
135
|
+
name: str = "emb"
|
|
136
|
+
is_unary: bool = True
|
|
137
|
+
input_type: Optional[str] = "string"
|
|
138
|
+
output_type: Optional[str] = "vector"
|