upgini 1.1.316a5__tar.gz → 1.1.317__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.316a5 → upgini-1.1.317}/PKG-INFO +3 -3
- {upgini-1.1.316a5 → upgini-1.1.317}/pyproject.toml +11 -13
- upgini-1.1.317/src/upgini/__about__.py +1 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/binary.py +75 -72
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/date.py +26 -43
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/groupby.py +22 -22
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/operand.py +4 -4
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/unary.py +46 -47
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/vector.py +8 -8
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/dataset.py +3 -8
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/features_enricher.py +4 -5
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/http.py +15 -15
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/lazy_import.py +1 -14
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/metadata.py +57 -57
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/normalizer/normalize_utils.py +2 -1
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/datetime_utils.py +5 -5
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/phone_utils.py +7 -5
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/postal_code_utils.py +1 -1
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/target_utils.py +1 -4
- upgini-1.1.316a5/src/upgini/__about__.py +0 -1
- {upgini-1.1.316a5 → upgini-1.1.317}/.gitignore +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/LICENSE +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/README.md +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/ads.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/errors.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/metrics.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/search_task.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/spinner.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.316a5
|
|
3
|
+
Version: 1.1.317
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy
|
|
32
|
+
Requires-Dist: numpy>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic<
|
|
34
|
+
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -39,9 +39,9 @@ dependencies = [
|
|
|
39
39
|
"fastparquet>=0.8.1",
|
|
40
40
|
"ipywidgets>=8.1.0",
|
|
41
41
|
"lightgbm>=3.3.2",
|
|
42
|
-
"numpy>=1.19.0
|
|
42
|
+
"numpy>=1.19.0",
|
|
43
43
|
"pandas>=1.1.0,<3.0.0",
|
|
44
|
-
"pydantic
|
|
44
|
+
"pydantic>=1.8.2,<2.0.0",
|
|
45
45
|
"pyjwt>=2.8.0",
|
|
46
46
|
"python-dateutil>=2.8.0",
|
|
47
47
|
"python-json-logger>=2.0.2",
|
|
@@ -79,15 +79,15 @@ python = "3.10"
|
|
|
79
79
|
cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
|
|
80
80
|
format = "black {args}"
|
|
81
81
|
lint = "ruff check {args}"
|
|
82
|
-
|
|
82
|
+
test_binary = 'pytest -s -vv tests/test_binary_dataset.py'
|
|
83
83
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
[[tool.hatch.envs.test.matrix]]
|
|
85
|
+
python = ["3.8"]
|
|
86
|
+
pandas = ["1.1.0"]
|
|
87
87
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
88
|
+
[[tool.hatch.envs.test.matrix]]
|
|
89
|
+
python = ["3.8", "3.9", "3.10"]
|
|
90
|
+
pandas = ["1.2.0", "1.3.0", "1.4.0", "1.5.0", "2.0.0"]
|
|
91
91
|
|
|
92
92
|
[[tool.hatch.envs.test.matrix]]
|
|
93
93
|
python = ["3.9", "3.10"]
|
|
@@ -103,8 +103,7 @@ dependencies = [
|
|
|
103
103
|
# "pytest-timeout",
|
|
104
104
|
"requests-mock",
|
|
105
105
|
"pytest-datafiles",
|
|
106
|
-
"
|
|
107
|
-
"pandas~={matrix:pandas}",
|
|
106
|
+
"pandas~={matrix:pandas}.0",
|
|
108
107
|
]
|
|
109
108
|
|
|
110
109
|
[tool.black]
|
|
@@ -116,5 +115,4 @@ profile = "black"
|
|
|
116
115
|
[tool.pytest.ini_options]
|
|
117
116
|
pythonpath = [
|
|
118
117
|
"./src"
|
|
119
|
-
]
|
|
120
|
-
addopts="-n 4"
|
|
118
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.317"
|
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name
|
|
13
|
-
is_binary
|
|
14
|
-
is_symmetrical
|
|
15
|
-
has_symmetry_importance
|
|
12
|
+
name = "min"
|
|
13
|
+
is_binary = True
|
|
14
|
+
is_symmetrical = True
|
|
15
|
+
has_symmetry_importance = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name
|
|
23
|
-
is_binary
|
|
24
|
-
is_symmetrical
|
|
25
|
-
has_symmetry_importance
|
|
22
|
+
name = "max"
|
|
23
|
+
is_binary = True
|
|
24
|
+
is_symmetrical = True
|
|
25
|
+
has_symmetry_importance = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name
|
|
33
|
-
alias
|
|
34
|
-
is_binary
|
|
35
|
-
is_symmetrical
|
|
36
|
-
has_symmetry_importance
|
|
37
|
-
is_vectorizable
|
|
32
|
+
name = "+"
|
|
33
|
+
alias = "add"
|
|
34
|
+
is_binary = True
|
|
35
|
+
is_symmetrical = True
|
|
36
|
+
has_symmetry_importance = True
|
|
37
|
+
is_vectorizable = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name
|
|
52
|
-
alias
|
|
53
|
-
is_binary
|
|
54
|
-
is_symmetrical
|
|
55
|
-
has_symmetry_importance
|
|
56
|
-
is_vectorizable
|
|
51
|
+
name = "-"
|
|
52
|
+
alias = "sub"
|
|
53
|
+
is_binary = True
|
|
54
|
+
is_symmetrical = True
|
|
55
|
+
has_symmetry_importance = True
|
|
56
|
+
is_vectorizable = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name
|
|
71
|
-
alias
|
|
72
|
-
is_binary
|
|
73
|
-
is_symmetrical
|
|
74
|
-
has_symmetry_importance
|
|
75
|
-
is_vectorizable
|
|
70
|
+
name = "*"
|
|
71
|
+
alias = "mul"
|
|
72
|
+
is_binary = True
|
|
73
|
+
is_symmetrical = True
|
|
74
|
+
has_symmetry_importance = True
|
|
75
|
+
is_vectorizable = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name
|
|
90
|
-
alias
|
|
91
|
-
is_binary
|
|
92
|
-
has_symmetry_importance
|
|
93
|
-
is_vectorizable
|
|
94
|
-
output_type
|
|
89
|
+
name = "/"
|
|
90
|
+
alias = "div"
|
|
91
|
+
is_binary = True
|
|
92
|
+
has_symmetry_importance = True
|
|
93
|
+
is_vectorizable = True
|
|
94
|
+
output_type = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name
|
|
109
|
-
is_binary
|
|
110
|
-
has_symmetry_importance
|
|
111
|
-
output_type
|
|
108
|
+
name = "Combine"
|
|
109
|
+
is_binary = True
|
|
110
|
+
has_symmetry_importance = True
|
|
111
|
+
output_type = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name
|
|
121
|
-
is_binary
|
|
122
|
-
is_symmetrical
|
|
123
|
-
has_symmetry_importance
|
|
124
|
-
output_type
|
|
125
|
-
is_distribution_dependent
|
|
126
|
-
input_type
|
|
120
|
+
name = "CombineThenFreq"
|
|
121
|
+
is_binary = True
|
|
122
|
+
is_symmetrical = True
|
|
123
|
+
has_symmetry_importance = True
|
|
124
|
+
output_type = "float"
|
|
125
|
+
is_distribution_dependent = True
|
|
126
|
+
input_type = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name
|
|
137
|
-
is_binary
|
|
138
|
-
output_type
|
|
139
|
-
is_symmetrical
|
|
140
|
-
has_symmetry_importance
|
|
136
|
+
name = "dist"
|
|
137
|
+
is_binary = True
|
|
138
|
+
output_type = "float"
|
|
139
|
+
is_symmetrical = True
|
|
140
|
+
has_symmetry_importance = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.
|
|
144
|
+
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
# row-wise dot product
|
|
@@ -152,14 +152,17 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
+
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
156
|
+
return np.sqrt(self.__dot(vector, vector))
|
|
157
|
+
|
|
155
158
|
|
|
156
159
|
# Left for backward compatibility
|
|
157
160
|
class Sim(Distance):
|
|
158
|
-
name
|
|
159
|
-
is_binary
|
|
160
|
-
output_type
|
|
161
|
-
is_symmetrical
|
|
162
|
-
has_symmetry_importance
|
|
161
|
+
name = "sim"
|
|
162
|
+
is_binary = True
|
|
163
|
+
output_type = "float"
|
|
164
|
+
is_symmetrical = True
|
|
165
|
+
has_symmetry_importance = True
|
|
163
166
|
|
|
164
167
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
165
168
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -188,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
188
191
|
|
|
189
192
|
|
|
190
193
|
class JaroWinklerSim1(StringSim):
|
|
191
|
-
name
|
|
192
|
-
is_binary
|
|
193
|
-
input_type
|
|
194
|
-
output_type
|
|
195
|
-
is_symmetrical
|
|
196
|
-
has_symmetry_importance
|
|
194
|
+
name = "sim_jw1"
|
|
195
|
+
is_binary = True
|
|
196
|
+
input_type = "string"
|
|
197
|
+
output_type = "float"
|
|
198
|
+
is_symmetrical = True
|
|
199
|
+
has_symmetry_importance = True
|
|
197
200
|
|
|
198
201
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
202
|
return value
|
|
@@ -203,12 +206,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
203
206
|
|
|
204
207
|
|
|
205
208
|
class JaroWinklerSim2(StringSim):
|
|
206
|
-
name
|
|
207
|
-
is_binary
|
|
208
|
-
input_type
|
|
209
|
-
output_type
|
|
210
|
-
is_symmetrical
|
|
211
|
-
has_symmetry_importance
|
|
209
|
+
name = "sim_jw2"
|
|
210
|
+
is_binary = True
|
|
211
|
+
input_type = "string"
|
|
212
|
+
output_type = "float"
|
|
213
|
+
is_symmetrical = True
|
|
214
|
+
has_symmetry_importance = True
|
|
212
215
|
|
|
213
216
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
217
|
return value[::-1] if value is not None else None
|
|
@@ -218,12 +221,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
218
221
|
|
|
219
222
|
|
|
220
223
|
class LevenshteinSim(StringSim):
|
|
221
|
-
name
|
|
222
|
-
is_binary
|
|
223
|
-
input_type
|
|
224
|
-
output_type
|
|
225
|
-
is_symmetrical
|
|
226
|
-
has_symmetry_importance
|
|
224
|
+
name = "sim_lv"
|
|
225
|
+
is_binary = True
|
|
226
|
+
input_type = "string"
|
|
227
|
+
output_type = "float"
|
|
228
|
+
is_symmetrical = True
|
|
229
|
+
has_symmetry_importance = True
|
|
227
230
|
|
|
228
231
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
232
|
return value
|
|
@@ -1,19 +1,15 @@
|
|
|
1
1
|
import abc
|
|
2
|
+
import json
|
|
2
3
|
from typing import Any, Dict, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
7
|
-
from pydantic import BaseModel,
|
|
8
|
+
from pydantic import BaseModel, validator
|
|
8
9
|
|
|
9
10
|
from upgini.autofe.operand import PandasOperand
|
|
10
11
|
|
|
11
12
|
|
|
12
|
-
def get_pydantic_version():
|
|
13
|
-
major_version = int(pydantic_version.split('.')[0])
|
|
14
|
-
return major_version
|
|
15
|
-
|
|
16
|
-
|
|
17
13
|
class DateDiffMixin(BaseModel):
|
|
18
14
|
diff_unit: str = "D"
|
|
19
15
|
left_unit: Optional[str] = None
|
|
@@ -43,10 +39,10 @@ class DateDiffMixin(BaseModel):
|
|
|
43
39
|
|
|
44
40
|
|
|
45
41
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
46
|
-
name
|
|
47
|
-
alias
|
|
48
|
-
is_binary
|
|
49
|
-
has_symmetry_importance
|
|
42
|
+
name = "date_diff"
|
|
43
|
+
alias = "date_diff_type1"
|
|
44
|
+
is_binary = True
|
|
45
|
+
has_symmetry_importance = True
|
|
50
46
|
|
|
51
47
|
replace_negative: bool = False
|
|
52
48
|
|
|
@@ -75,9 +71,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
75
71
|
|
|
76
72
|
|
|
77
73
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
78
|
-
name
|
|
79
|
-
is_binary
|
|
80
|
-
has_symmetry_importance
|
|
74
|
+
name = "date_diff_type2"
|
|
75
|
+
is_binary = True
|
|
76
|
+
has_symmetry_importance = True
|
|
81
77
|
|
|
82
78
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
83
79
|
res = super().get_params()
|
|
@@ -109,8 +105,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
109
105
|
|
|
110
106
|
|
|
111
107
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
112
|
-
is_binary
|
|
113
|
-
has_symmetry_importance
|
|
108
|
+
is_binary = True
|
|
109
|
+
has_symmetry_importance = True
|
|
114
110
|
|
|
115
111
|
aggregation: str
|
|
116
112
|
replace_negative: bool = False
|
|
@@ -170,8 +166,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
170
166
|
|
|
171
167
|
|
|
172
168
|
class DateListDiffBounded(DateListDiff):
|
|
173
|
-
lower_bound: Optional[int]
|
|
174
|
-
upper_bound: Optional[int]
|
|
169
|
+
lower_bound: Optional[int]
|
|
170
|
+
upper_bound: Optional[int]
|
|
175
171
|
|
|
176
172
|
def __init__(self, **data: Any) -> None:
|
|
177
173
|
if "name" not in data:
|
|
@@ -196,8 +192,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
196
192
|
|
|
197
193
|
|
|
198
194
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
199
|
-
is_binary
|
|
200
|
-
output_type
|
|
195
|
+
is_binary = True
|
|
196
|
+
output_type = "float"
|
|
201
197
|
|
|
202
198
|
date_unit: Optional[str] = None
|
|
203
199
|
|
|
@@ -231,12 +227,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
231
227
|
|
|
232
228
|
|
|
233
229
|
class DatePercentile(DatePercentileBase):
|
|
234
|
-
name
|
|
235
|
-
alias
|
|
230
|
+
name = "date_per"
|
|
231
|
+
alias = "date_per_method1"
|
|
236
232
|
|
|
237
|
-
zero_month: Optional[int]
|
|
238
|
-
zero_year: Optional[int]
|
|
239
|
-
zero_bounds: Optional[List[float]]
|
|
233
|
+
zero_month: Optional[int]
|
|
234
|
+
zero_year: Optional[int]
|
|
235
|
+
zero_bounds: Optional[List[float]]
|
|
240
236
|
step: int = 30
|
|
241
237
|
|
|
242
238
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -251,25 +247,12 @@ class DatePercentile(DatePercentileBase):
|
|
|
251
247
|
)
|
|
252
248
|
return res
|
|
253
249
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
from pydantic import field_validator
|
|
258
|
-
|
|
259
|
-
@field_validator('zero_bounds', mode='before')
|
|
260
|
-
def parse_zero_bounds(cls, value):
|
|
261
|
-
if isinstance(value, str):
|
|
262
|
-
return value[1:-1].split(", ")
|
|
263
|
-
return value
|
|
264
|
-
else:
|
|
265
|
-
# Use @validator for Pydantic 1.x
|
|
266
|
-
from pydantic import validator
|
|
267
|
-
|
|
268
|
-
@validator('zero_bounds', pre=True)
|
|
269
|
-
def parse_zero_bounds(cls, value):
|
|
270
|
-
if isinstance(value, str):
|
|
271
|
-
return value[1:-1].split(", ")
|
|
250
|
+
@validator("zero_bounds", pre=True)
|
|
251
|
+
def validate_bounds(cls, value):
|
|
252
|
+
if value is None or isinstance(value, list):
|
|
272
253
|
return value
|
|
254
|
+
elif isinstance(value, str):
|
|
255
|
+
return json.loads(value)
|
|
273
256
|
|
|
274
257
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
275
258
|
months = date_col.dt.month
|
|
@@ -282,7 +265,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
282
265
|
|
|
283
266
|
|
|
284
267
|
class DatePercentileMethod2(DatePercentileBase):
|
|
285
|
-
name
|
|
268
|
+
name = "date_per_method2"
|
|
286
269
|
|
|
287
270
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
288
271
|
pass
|
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable
|
|
11
|
-
is_grouping
|
|
12
|
-
is_distribution_dependent
|
|
10
|
+
is_vectorizable = True
|
|
11
|
+
is_grouping = True
|
|
12
|
+
is_distribution_dependent = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name
|
|
28
|
-
pandas_agg
|
|
29
|
-
is_distribution_dependent
|
|
27
|
+
name = "GroupByThenMedian"
|
|
28
|
+
pandas_agg = "median"
|
|
29
|
+
is_distribution_dependent = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name
|
|
34
|
-
is_vectorizable
|
|
35
|
-
is_grouping
|
|
36
|
-
output_type
|
|
37
|
-
is_distribution_dependent
|
|
33
|
+
name = "GroupByThenRank"
|
|
34
|
+
is_vectorizable = True
|
|
35
|
+
is_grouping = True
|
|
36
|
+
output_type = "float"
|
|
37
|
+
is_distribution_dependent = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name
|
|
53
|
-
is_vectorizable
|
|
54
|
-
is_grouping
|
|
55
|
-
output_type
|
|
56
|
-
is_distribution_dependent
|
|
57
|
-
input_type
|
|
52
|
+
name = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable = True
|
|
54
|
+
is_grouping = True
|
|
55
|
+
output_type = "int"
|
|
56
|
+
is_distribution_dependent = True
|
|
57
|
+
input_type = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name
|
|
73
|
-
is_grouping
|
|
74
|
-
output_type
|
|
75
|
-
is_distribution_dependent
|
|
76
|
-
input_type
|
|
72
|
+
name = "GroupByThenFreq"
|
|
73
|
+
is_grouping = True
|
|
74
|
+
output_type = "float"
|
|
75
|
+
is_distribution_dependent = True
|
|
76
|
+
input_type = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str]
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str]
|
|
16
|
+
output_type: Optional[str]
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]]
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|