upgini 1.1.317__tar.gz → 1.2.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.317 → upgini-1.2.0a1}/PKG-INFO +3 -3
- {upgini-1.1.317 → upgini-1.2.0a1}/pyproject.toml +7 -5
- upgini-1.2.0a1/src/upgini/__about__.py +1 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/binary.py +71 -71
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/date.py +43 -25
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/groupby.py +22 -22
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/operand.py +4 -4
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/unary.py +47 -46
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/vector.py +8 -8
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/dataset.py +8 -3
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/features_enricher.py +6 -4
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/http.py +15 -15
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/lazy_import.py +14 -1
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/metadata.py +57 -57
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/normalizer/normalize_utils.py +1 -2
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/search_task.py +10 -4
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/datetime_utils.py +5 -5
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/phone_utils.py +5 -7
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/postal_code_utils.py +1 -1
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/target_utils.py +4 -1
- upgini-1.1.317/src/upgini/__about__.py +0 -1
- {upgini-1.1.317 → upgini-1.2.0a1}/.gitignore +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/LICENSE +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/README.md +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/ads.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/errors.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/metrics.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.317 → upgini-1.2.0a1}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.317
|
|
3
|
+
Version: 1.2.0a1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy>=1.19.0
|
|
32
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic<
|
|
34
|
+
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -39,9 +39,9 @@ dependencies = [
|
|
|
39
39
|
"fastparquet>=0.8.1",
|
|
40
40
|
"ipywidgets>=8.1.0",
|
|
41
41
|
"lightgbm>=3.3.2",
|
|
42
|
-
"numpy>=1.19.0",
|
|
42
|
+
"numpy>=1.19.0,<=1.26.4",
|
|
43
43
|
"pandas>=1.1.0,<3.0.0",
|
|
44
|
-
"pydantic
|
|
44
|
+
"pydantic>1.0.0,<3.0.0",
|
|
45
45
|
"pyjwt>=2.8.0",
|
|
46
46
|
"python-dateutil>=2.8.0",
|
|
47
47
|
"python-json-logger>=2.0.2",
|
|
@@ -79,7 +79,7 @@ python = "3.10"
|
|
|
79
79
|
cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
|
|
80
80
|
format = "black {args}"
|
|
81
81
|
lint = "ruff check {args}"
|
|
82
|
-
|
|
82
|
+
test_all = 'pytest -s -vv tests'
|
|
83
83
|
|
|
84
84
|
[[tool.hatch.envs.test.matrix]]
|
|
85
85
|
python = ["3.8"]
|
|
@@ -103,7 +103,8 @@ dependencies = [
|
|
|
103
103
|
# "pytest-timeout",
|
|
104
104
|
"requests-mock",
|
|
105
105
|
"pytest-datafiles",
|
|
106
|
-
"
|
|
106
|
+
"pytest-xdist",
|
|
107
|
+
"pandas~={matrix:pandas}",
|
|
107
108
|
]
|
|
108
109
|
|
|
109
110
|
[tool.black]
|
|
@@ -115,4 +116,5 @@ profile = "black"
|
|
|
115
116
|
[tool.pytest.ini_options]
|
|
116
117
|
pythonpath = [
|
|
117
118
|
"./src"
|
|
118
|
-
]
|
|
119
|
+
]
|
|
120
|
+
addopts="-n 4"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.0a1"
|
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name = "min"
|
|
13
|
-
is_binary = True
|
|
14
|
-
is_symmetrical = True
|
|
15
|
-
has_symmetry_importance = True
|
|
12
|
+
name: str = "min"
|
|
13
|
+
is_binary: bool = True
|
|
14
|
+
is_symmetrical: bool = True
|
|
15
|
+
has_symmetry_importance: bool = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name = "max"
|
|
23
|
-
is_binary = True
|
|
24
|
-
is_symmetrical = True
|
|
25
|
-
has_symmetry_importance = True
|
|
22
|
+
name: str = "max"
|
|
23
|
+
is_binary: bool = True
|
|
24
|
+
is_symmetrical: bool = True
|
|
25
|
+
has_symmetry_importance: bool = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name = "+"
|
|
33
|
-
alias = "add"
|
|
34
|
-
is_binary = True
|
|
35
|
-
is_symmetrical = True
|
|
36
|
-
has_symmetry_importance = True
|
|
37
|
-
is_vectorizable = True
|
|
32
|
+
name: str = "+"
|
|
33
|
+
alias: str = "add"
|
|
34
|
+
is_binary: bool = True
|
|
35
|
+
is_symmetrical: bool = True
|
|
36
|
+
has_symmetry_importance: bool = True
|
|
37
|
+
is_vectorizable: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name = "-"
|
|
52
|
-
alias = "sub"
|
|
53
|
-
is_binary = True
|
|
54
|
-
is_symmetrical = True
|
|
55
|
-
has_symmetry_importance = True
|
|
56
|
-
is_vectorizable = True
|
|
51
|
+
name: str = "-"
|
|
52
|
+
alias: str = "sub"
|
|
53
|
+
is_binary: bool = True
|
|
54
|
+
is_symmetrical: bool = True
|
|
55
|
+
has_symmetry_importance: bool = True
|
|
56
|
+
is_vectorizable: bool = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name = "*"
|
|
71
|
-
alias = "mul"
|
|
72
|
-
is_binary = True
|
|
73
|
-
is_symmetrical = True
|
|
74
|
-
has_symmetry_importance = True
|
|
75
|
-
is_vectorizable = True
|
|
70
|
+
name: str = "*"
|
|
71
|
+
alias: str = "mul"
|
|
72
|
+
is_binary: bool = True
|
|
73
|
+
is_symmetrical: bool = True
|
|
74
|
+
has_symmetry_importance: bool = True
|
|
75
|
+
is_vectorizable: bool = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name = "/"
|
|
90
|
-
alias = "div"
|
|
91
|
-
is_binary = True
|
|
92
|
-
has_symmetry_importance = True
|
|
93
|
-
is_vectorizable = True
|
|
94
|
-
output_type = "float"
|
|
89
|
+
name: str = "/"
|
|
90
|
+
alias: str = "div"
|
|
91
|
+
is_binary: bool = True
|
|
92
|
+
has_symmetry_importance: bool = True
|
|
93
|
+
is_vectorizable: bool = True
|
|
94
|
+
output_type: Optional[str] = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name = "Combine"
|
|
109
|
-
is_binary = True
|
|
110
|
-
has_symmetry_importance = True
|
|
111
|
-
output_type = "object"
|
|
108
|
+
name: str = "Combine"
|
|
109
|
+
is_binary: bool = True
|
|
110
|
+
has_symmetry_importance: bool = True
|
|
111
|
+
output_type: Optional[str] = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name = "CombineThenFreq"
|
|
121
|
-
is_binary = True
|
|
122
|
-
is_symmetrical = True
|
|
123
|
-
has_symmetry_importance = True
|
|
124
|
-
output_type = "float"
|
|
125
|
-
is_distribution_dependent = True
|
|
126
|
-
input_type = "discrete"
|
|
120
|
+
name: str = "CombineThenFreq"
|
|
121
|
+
is_binary: bool = True
|
|
122
|
+
is_symmetrical: bool = True
|
|
123
|
+
has_symmetry_importance: bool = True
|
|
124
|
+
output_type: Optional[str] = "float"
|
|
125
|
+
is_distribution_dependent: bool = True
|
|
126
|
+
input_type: Optional[str] = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,11 +133,11 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name = "dist"
|
|
137
|
-
is_binary = True
|
|
138
|
-
output_type = "float"
|
|
139
|
-
is_symmetrical = True
|
|
140
|
-
has_symmetry_importance = True
|
|
136
|
+
name: str = "dist"
|
|
137
|
+
is_binary: bool = True
|
|
138
|
+
output_type: Optional[str] = "float"
|
|
139
|
+
is_symmetrical: bool = True
|
|
140
|
+
has_symmetry_importance: bool = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
@@ -158,11 +158,11 @@ class Distance(PandasOperand):
|
|
|
158
158
|
|
|
159
159
|
# Left for backward compatibility
|
|
160
160
|
class Sim(Distance):
|
|
161
|
-
name = "sim"
|
|
162
|
-
is_binary = True
|
|
163
|
-
output_type = "float"
|
|
164
|
-
is_symmetrical = True
|
|
165
|
-
has_symmetry_importance = True
|
|
161
|
+
name: str = "sim"
|
|
162
|
+
is_binary: bool = True
|
|
163
|
+
output_type: Optional[str] = "float"
|
|
164
|
+
is_symmetrical: bool = True
|
|
165
|
+
has_symmetry_importance: bool = True
|
|
166
166
|
|
|
167
167
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
168
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -191,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
class JaroWinklerSim1(StringSim):
|
|
194
|
-
name = "sim_jw1"
|
|
195
|
-
is_binary = True
|
|
196
|
-
input_type = "string"
|
|
197
|
-
output_type = "float"
|
|
198
|
-
is_symmetrical = True
|
|
199
|
-
has_symmetry_importance = True
|
|
194
|
+
name: str = "sim_jw1"
|
|
195
|
+
is_binary: bool = True
|
|
196
|
+
input_type: Optional[str] = "string"
|
|
197
|
+
output_type: Optional[str] = "float"
|
|
198
|
+
is_symmetrical: bool = True
|
|
199
|
+
has_symmetry_importance: bool = True
|
|
200
200
|
|
|
201
201
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
202
202
|
return value
|
|
@@ -206,12 +206,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
206
206
|
|
|
207
207
|
|
|
208
208
|
class JaroWinklerSim2(StringSim):
|
|
209
|
-
name = "sim_jw2"
|
|
210
|
-
is_binary = True
|
|
211
|
-
input_type = "string"
|
|
212
|
-
output_type = "float"
|
|
213
|
-
is_symmetrical = True
|
|
214
|
-
has_symmetry_importance = True
|
|
209
|
+
name: str = "sim_jw2"
|
|
210
|
+
is_binary: bool = True
|
|
211
|
+
input_type: Optional[str] = "string"
|
|
212
|
+
output_type: Optional[str] = "float"
|
|
213
|
+
is_symmetrical: bool = True
|
|
214
|
+
has_symmetry_importance: bool = True
|
|
215
215
|
|
|
216
216
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
217
217
|
return value[::-1] if value is not None else None
|
|
@@ -221,12 +221,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
221
221
|
|
|
222
222
|
|
|
223
223
|
class LevenshteinSim(StringSim):
|
|
224
|
-
name = "sim_lv"
|
|
225
|
-
is_binary = True
|
|
226
|
-
input_type = "string"
|
|
227
|
-
output_type = "float"
|
|
228
|
-
is_symmetrical = True
|
|
229
|
-
has_symmetry_importance = True
|
|
224
|
+
name: str = "sim_lv"
|
|
225
|
+
is_binary: bool = True
|
|
226
|
+
input_type: Optional[str] = "string"
|
|
227
|
+
output_type: Optional[str] = "float"
|
|
228
|
+
is_symmetrical: bool = True
|
|
229
|
+
has_symmetry_importance: bool = True
|
|
230
230
|
|
|
231
231
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
232
232
|
return value
|
|
@@ -5,11 +5,16 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
|
-
from pydantic import BaseModel, validator
|
|
8
|
+
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
10
|
from upgini.autofe.operand import PandasOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
|
+
def get_pydantic_version():
|
|
14
|
+
major_version = int(pydantic_version.split('.')[0])
|
|
15
|
+
return major_version
|
|
16
|
+
|
|
17
|
+
|
|
13
18
|
class DateDiffMixin(BaseModel):
|
|
14
19
|
diff_unit: str = "D"
|
|
15
20
|
left_unit: Optional[str] = None
|
|
@@ -39,10 +44,10 @@ class DateDiffMixin(BaseModel):
|
|
|
39
44
|
|
|
40
45
|
|
|
41
46
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
42
|
-
name = "date_diff"
|
|
43
|
-
alias = "date_diff_type1"
|
|
44
|
-
is_binary = True
|
|
45
|
-
has_symmetry_importance = True
|
|
47
|
+
name: str = "date_diff"
|
|
48
|
+
alias: Optional[str] = "date_diff_type1"
|
|
49
|
+
is_binary: bool = True
|
|
50
|
+
has_symmetry_importance: bool = True
|
|
46
51
|
|
|
47
52
|
replace_negative: bool = False
|
|
48
53
|
|
|
@@ -71,9 +76,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
71
76
|
|
|
72
77
|
|
|
73
78
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
74
|
-
name = "date_diff_type2"
|
|
75
|
-
is_binary = True
|
|
76
|
-
has_symmetry_importance = True
|
|
79
|
+
name: str = "date_diff_type2"
|
|
80
|
+
is_binary: bool = True
|
|
81
|
+
has_symmetry_importance: bool = True
|
|
77
82
|
|
|
78
83
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
79
84
|
res = super().get_params()
|
|
@@ -105,8 +110,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
105
110
|
|
|
106
111
|
|
|
107
112
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
108
|
-
is_binary = True
|
|
109
|
-
has_symmetry_importance = True
|
|
113
|
+
is_binary: bool = True
|
|
114
|
+
has_symmetry_importance: bool = True
|
|
110
115
|
|
|
111
116
|
aggregation: str
|
|
112
117
|
replace_negative: bool = False
|
|
@@ -166,8 +171,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
166
171
|
|
|
167
172
|
|
|
168
173
|
class DateListDiffBounded(DateListDiff):
|
|
169
|
-
lower_bound: Optional[int]
|
|
170
|
-
upper_bound: Optional[int]
|
|
174
|
+
lower_bound: Optional[int] = None
|
|
175
|
+
upper_bound: Optional[int] = None
|
|
171
176
|
|
|
172
177
|
def __init__(self, **data: Any) -> None:
|
|
173
178
|
if "name" not in data:
|
|
@@ -192,8 +197,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
192
197
|
|
|
193
198
|
|
|
194
199
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
195
|
-
is_binary = True
|
|
196
|
-
output_type = "float"
|
|
200
|
+
is_binary: bool = True
|
|
201
|
+
output_type: Optional[str] = "float"
|
|
197
202
|
|
|
198
203
|
date_unit: Optional[str] = None
|
|
199
204
|
|
|
@@ -227,12 +232,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
227
232
|
|
|
228
233
|
|
|
229
234
|
class DatePercentile(DatePercentileBase):
|
|
230
|
-
name = "date_per"
|
|
231
|
-
alias = "date_per_method1"
|
|
235
|
+
name: str = "date_per"
|
|
236
|
+
alias: Optional[str] = "date_per_method1"
|
|
232
237
|
|
|
233
|
-
zero_month: Optional[int]
|
|
234
|
-
zero_year: Optional[int]
|
|
235
|
-
zero_bounds: Optional[List[float]]
|
|
238
|
+
zero_month: Optional[int] = None
|
|
239
|
+
zero_year: Optional[int] = None
|
|
240
|
+
zero_bounds: Optional[List[float]] = None
|
|
236
241
|
step: int = 30
|
|
237
242
|
|
|
238
243
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -247,12 +252,25 @@ class DatePercentile(DatePercentileBase):
|
|
|
247
252
|
)
|
|
248
253
|
return res
|
|
249
254
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
255
|
+
# Check Pydantic version
|
|
256
|
+
if get_pydantic_version() >= 2:
|
|
257
|
+
# Use @field_validator for Pydantic 2.x
|
|
258
|
+
from pydantic import field_validator
|
|
259
|
+
|
|
260
|
+
@field_validator('zero_bounds', mode='before')
|
|
261
|
+
def parse_zero_bounds(cls, value):
|
|
262
|
+
if isinstance(value, str):
|
|
263
|
+
return json.loads(value)
|
|
264
|
+
return value
|
|
265
|
+
else:
|
|
266
|
+
# Use @validator for Pydantic 1.x
|
|
267
|
+
from pydantic import validator
|
|
268
|
+
|
|
269
|
+
@validator('zero_bounds', pre=True)
|
|
270
|
+
def parse_zero_bounds(cls, value):
|
|
271
|
+
if isinstance(value, str):
|
|
272
|
+
return json.loads(value)
|
|
253
273
|
return value
|
|
254
|
-
elif isinstance(value, str):
|
|
255
|
-
return json.loads(value)
|
|
256
274
|
|
|
257
275
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
258
276
|
months = date_col.dt.month
|
|
@@ -265,7 +283,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
265
283
|
|
|
266
284
|
|
|
267
285
|
class DatePercentileMethod2(DatePercentileBase):
|
|
268
|
-
name = "date_per_method2"
|
|
286
|
+
name: str = "date_per_method2"
|
|
269
287
|
|
|
270
288
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
271
289
|
pass
|
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
is_grouping = True
|
|
12
|
-
is_distribution_dependent = True
|
|
10
|
+
is_vectorizable: bool = True
|
|
11
|
+
is_grouping: bool = True
|
|
12
|
+
is_distribution_dependent: bool = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name = "GroupByThenMedian"
|
|
28
|
-
pandas_agg = "median"
|
|
29
|
-
is_distribution_dependent = True
|
|
27
|
+
name: str = "GroupByThenMedian"
|
|
28
|
+
pandas_agg: str = "median"
|
|
29
|
+
is_distribution_dependent: bool = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name = "GroupByThenRank"
|
|
34
|
-
is_vectorizable = True
|
|
35
|
-
is_grouping = True
|
|
36
|
-
output_type = "float"
|
|
37
|
-
is_distribution_dependent = True
|
|
33
|
+
name: str = "GroupByThenRank"
|
|
34
|
+
is_vectorizable: bool = True
|
|
35
|
+
is_grouping: bool = True
|
|
36
|
+
output_type: Optional[str] = "float"
|
|
37
|
+
is_distribution_dependent: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name = "GroupByThenNUnique"
|
|
53
|
-
is_vectorizable = True
|
|
54
|
-
is_grouping = True
|
|
55
|
-
output_type = "int"
|
|
56
|
-
is_distribution_dependent = True
|
|
57
|
-
input_type = "discrete"
|
|
52
|
+
name: str = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
is_grouping: bool = True
|
|
55
|
+
output_type: Optional[str] = "int"
|
|
56
|
+
is_distribution_dependent: bool = True
|
|
57
|
+
input_type: Optional[str] = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name = "GroupByThenFreq"
|
|
73
|
-
is_grouping = True
|
|
74
|
-
output_type = "float"
|
|
75
|
-
is_distribution_dependent = True
|
|
76
|
-
input_type = "discrete"
|
|
72
|
+
name: str = "GroupByThenFreq"
|
|
73
|
+
is_grouping: bool = True
|
|
74
|
+
output_type: Optional[str] = "float"
|
|
75
|
+
is_distribution_dependent: bool = True
|
|
76
|
+
input_type: Optional[str] = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str] = None
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str] = None
|
|
16
|
+
output_type: Optional[str] = None
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]] = None
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|