upgini 1.1.317__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +71 -71
- upgini/autofe/date.py +43 -25
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +4 -4
- upgini/autofe/unary.py +65 -50
- upgini/autofe/vector.py +8 -8
- upgini/dataset.py +8 -3
- upgini/features_enricher.py +6 -4
- upgini/http.py +15 -15
- upgini/lazy_import.py +14 -1
- upgini/metadata.py +57 -57
- upgini/normalizer/normalize_utils.py +1 -2
- upgini/search_task.py +10 -4
- upgini/utils/datetime_utils.py +5 -5
- upgini/utils/phone_utils.py +5 -7
- upgini/utils/postal_code_utils.py +1 -1
- upgini/utils/target_utils.py +4 -1
- {upgini-1.1.317.dist-info → upgini-1.2.1.dist-info}/METADATA +3 -3
- {upgini-1.1.317.dist-info → upgini-1.2.1.dist-info}/RECORD +22 -22
- {upgini-1.1.317.dist-info → upgini-1.2.1.dist-info}/WHEEL +0 -0
- {upgini-1.1.317.dist-info → upgini-1.2.1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.317"
|
|
1
|
+
__version__ = "1.2.1"
|
upgini/autofe/binary.py
CHANGED
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name = "min"
|
|
13
|
-
is_binary = True
|
|
14
|
-
is_symmetrical = True
|
|
15
|
-
has_symmetry_importance = True
|
|
12
|
+
name: str = "min"
|
|
13
|
+
is_binary: bool = True
|
|
14
|
+
is_symmetrical: bool = True
|
|
15
|
+
has_symmetry_importance: bool = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name = "max"
|
|
23
|
-
is_binary = True
|
|
24
|
-
is_symmetrical = True
|
|
25
|
-
has_symmetry_importance = True
|
|
22
|
+
name: str = "max"
|
|
23
|
+
is_binary: bool = True
|
|
24
|
+
is_symmetrical: bool = True
|
|
25
|
+
has_symmetry_importance: bool = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name = "+"
|
|
33
|
-
alias = "add"
|
|
34
|
-
is_binary = True
|
|
35
|
-
is_symmetrical = True
|
|
36
|
-
has_symmetry_importance = True
|
|
37
|
-
is_vectorizable = True
|
|
32
|
+
name: str = "+"
|
|
33
|
+
alias: str = "add"
|
|
34
|
+
is_binary: bool = True
|
|
35
|
+
is_symmetrical: bool = True
|
|
36
|
+
has_symmetry_importance: bool = True
|
|
37
|
+
is_vectorizable: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name = "-"
|
|
52
|
-
alias = "sub"
|
|
53
|
-
is_binary = True
|
|
54
|
-
is_symmetrical = True
|
|
55
|
-
has_symmetry_importance = True
|
|
56
|
-
is_vectorizable = True
|
|
51
|
+
name: str = "-"
|
|
52
|
+
alias: str = "sub"
|
|
53
|
+
is_binary: bool = True
|
|
54
|
+
is_symmetrical: bool = True
|
|
55
|
+
has_symmetry_importance: bool = True
|
|
56
|
+
is_vectorizable: bool = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name = "*"
|
|
71
|
-
alias = "mul"
|
|
72
|
-
is_binary = True
|
|
73
|
-
is_symmetrical = True
|
|
74
|
-
has_symmetry_importance = True
|
|
75
|
-
is_vectorizable = True
|
|
70
|
+
name: str = "*"
|
|
71
|
+
alias: str = "mul"
|
|
72
|
+
is_binary: bool = True
|
|
73
|
+
is_symmetrical: bool = True
|
|
74
|
+
has_symmetry_importance: bool = True
|
|
75
|
+
is_vectorizable: bool = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name = "/"
|
|
90
|
-
alias = "div"
|
|
91
|
-
is_binary = True
|
|
92
|
-
has_symmetry_importance = True
|
|
93
|
-
is_vectorizable = True
|
|
94
|
-
output_type = "float"
|
|
89
|
+
name: str = "/"
|
|
90
|
+
alias: str = "div"
|
|
91
|
+
is_binary: bool = True
|
|
92
|
+
has_symmetry_importance: bool = True
|
|
93
|
+
is_vectorizable: bool = True
|
|
94
|
+
output_type: Optional[str] = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name = "Combine"
|
|
109
|
-
is_binary = True
|
|
110
|
-
has_symmetry_importance = True
|
|
111
|
-
output_type = "object"
|
|
108
|
+
name: str = "Combine"
|
|
109
|
+
is_binary: bool = True
|
|
110
|
+
has_symmetry_importance: bool = True
|
|
111
|
+
output_type: Optional[str] = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name = "CombineThenFreq"
|
|
121
|
-
is_binary = True
|
|
122
|
-
is_symmetrical = True
|
|
123
|
-
has_symmetry_importance = True
|
|
124
|
-
output_type = "float"
|
|
125
|
-
is_distribution_dependent = True
|
|
126
|
-
input_type = "discrete"
|
|
120
|
+
name: str = "CombineThenFreq"
|
|
121
|
+
is_binary: bool = True
|
|
122
|
+
is_symmetrical: bool = True
|
|
123
|
+
has_symmetry_importance: bool = True
|
|
124
|
+
output_type: Optional[str] = "float"
|
|
125
|
+
is_distribution_dependent: bool = True
|
|
126
|
+
input_type: Optional[str] = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,11 +133,11 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name = "dist"
|
|
137
|
-
is_binary = True
|
|
138
|
-
output_type = "float"
|
|
139
|
-
is_symmetrical = True
|
|
140
|
-
has_symmetry_importance = True
|
|
136
|
+
name: str = "dist"
|
|
137
|
+
is_binary: bool = True
|
|
138
|
+
output_type: Optional[str] = "float"
|
|
139
|
+
is_symmetrical: bool = True
|
|
140
|
+
has_symmetry_importance: bool = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
@@ -158,11 +158,11 @@ class Distance(PandasOperand):
|
|
|
158
158
|
|
|
159
159
|
# Left for backward compatibility
|
|
160
160
|
class Sim(Distance):
|
|
161
|
-
name = "sim"
|
|
162
|
-
is_binary = True
|
|
163
|
-
output_type = "float"
|
|
164
|
-
is_symmetrical = True
|
|
165
|
-
has_symmetry_importance = True
|
|
161
|
+
name: str = "sim"
|
|
162
|
+
is_binary: bool = True
|
|
163
|
+
output_type: Optional[str] = "float"
|
|
164
|
+
is_symmetrical: bool = True
|
|
165
|
+
has_symmetry_importance: bool = True
|
|
166
166
|
|
|
167
167
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
168
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -191,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
class JaroWinklerSim1(StringSim):
|
|
194
|
-
name = "sim_jw1"
|
|
195
|
-
is_binary = True
|
|
196
|
-
input_type = "string"
|
|
197
|
-
output_type = "float"
|
|
198
|
-
is_symmetrical = True
|
|
199
|
-
has_symmetry_importance = True
|
|
194
|
+
name: str = "sim_jw1"
|
|
195
|
+
is_binary: bool = True
|
|
196
|
+
input_type: Optional[str] = "string"
|
|
197
|
+
output_type: Optional[str] = "float"
|
|
198
|
+
is_symmetrical: bool = True
|
|
199
|
+
has_symmetry_importance: bool = True
|
|
200
200
|
|
|
201
201
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
202
202
|
return value
|
|
@@ -206,12 +206,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
206
206
|
|
|
207
207
|
|
|
208
208
|
class JaroWinklerSim2(StringSim):
|
|
209
|
-
name = "sim_jw2"
|
|
210
|
-
is_binary = True
|
|
211
|
-
input_type = "string"
|
|
212
|
-
output_type = "float"
|
|
213
|
-
is_symmetrical = True
|
|
214
|
-
has_symmetry_importance = True
|
|
209
|
+
name: str = "sim_jw2"
|
|
210
|
+
is_binary: bool = True
|
|
211
|
+
input_type: Optional[str] = "string"
|
|
212
|
+
output_type: Optional[str] = "float"
|
|
213
|
+
is_symmetrical: bool = True
|
|
214
|
+
has_symmetry_importance: bool = True
|
|
215
215
|
|
|
216
216
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
217
217
|
return value[::-1] if value is not None else None
|
|
@@ -221,12 +221,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
221
221
|
|
|
222
222
|
|
|
223
223
|
class LevenshteinSim(StringSim):
|
|
224
|
-
name = "sim_lv"
|
|
225
|
-
is_binary = True
|
|
226
|
-
input_type = "string"
|
|
227
|
-
output_type = "float"
|
|
228
|
-
is_symmetrical = True
|
|
229
|
-
has_symmetry_importance = True
|
|
224
|
+
name: str = "sim_lv"
|
|
225
|
+
is_binary: bool = True
|
|
226
|
+
input_type: Optional[str] = "string"
|
|
227
|
+
output_type: Optional[str] = "float"
|
|
228
|
+
is_symmetrical: bool = True
|
|
229
|
+
has_symmetry_importance: bool = True
|
|
230
230
|
|
|
231
231
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
232
232
|
return value
|
upgini/autofe/date.py
CHANGED
|
@@ -5,11 +5,16 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
|
-
from pydantic import BaseModel, validator
|
|
8
|
+
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
10
|
from upgini.autofe.operand import PandasOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
|
+
def get_pydantic_version():
|
|
14
|
+
major_version = int(pydantic_version.split('.')[0])
|
|
15
|
+
return major_version
|
|
16
|
+
|
|
17
|
+
|
|
13
18
|
class DateDiffMixin(BaseModel):
|
|
14
19
|
diff_unit: str = "D"
|
|
15
20
|
left_unit: Optional[str] = None
|
|
@@ -39,10 +44,10 @@ class DateDiffMixin(BaseModel):
|
|
|
39
44
|
|
|
40
45
|
|
|
41
46
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
42
|
-
name = "date_diff"
|
|
43
|
-
alias = "date_diff_type1"
|
|
44
|
-
is_binary = True
|
|
45
|
-
has_symmetry_importance = True
|
|
47
|
+
name: str = "date_diff"
|
|
48
|
+
alias: Optional[str] = "date_diff_type1"
|
|
49
|
+
is_binary: bool = True
|
|
50
|
+
has_symmetry_importance: bool = True
|
|
46
51
|
|
|
47
52
|
replace_negative: bool = False
|
|
48
53
|
|
|
@@ -71,9 +76,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
71
76
|
|
|
72
77
|
|
|
73
78
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
74
|
-
name = "date_diff_type2"
|
|
75
|
-
is_binary = True
|
|
76
|
-
has_symmetry_importance = True
|
|
79
|
+
name: str = "date_diff_type2"
|
|
80
|
+
is_binary: bool = True
|
|
81
|
+
has_symmetry_importance: bool = True
|
|
77
82
|
|
|
78
83
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
79
84
|
res = super().get_params()
|
|
@@ -105,8 +110,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
105
110
|
|
|
106
111
|
|
|
107
112
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
108
|
-
is_binary = True
|
|
109
|
-
has_symmetry_importance = True
|
|
113
|
+
is_binary: bool = True
|
|
114
|
+
has_symmetry_importance: bool = True
|
|
110
115
|
|
|
111
116
|
aggregation: str
|
|
112
117
|
replace_negative: bool = False
|
|
@@ -166,8 +171,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
166
171
|
|
|
167
172
|
|
|
168
173
|
class DateListDiffBounded(DateListDiff):
|
|
169
|
-
lower_bound: Optional[int]
|
|
170
|
-
upper_bound: Optional[int]
|
|
174
|
+
lower_bound: Optional[int] = None
|
|
175
|
+
upper_bound: Optional[int] = None
|
|
171
176
|
|
|
172
177
|
def __init__(self, **data: Any) -> None:
|
|
173
178
|
if "name" not in data:
|
|
@@ -192,8 +197,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
192
197
|
|
|
193
198
|
|
|
194
199
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
195
|
-
is_binary = True
|
|
196
|
-
output_type = "float"
|
|
200
|
+
is_binary: bool = True
|
|
201
|
+
output_type: Optional[str] = "float"
|
|
197
202
|
|
|
198
203
|
date_unit: Optional[str] = None
|
|
199
204
|
|
|
@@ -227,12 +232,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
227
232
|
|
|
228
233
|
|
|
229
234
|
class DatePercentile(DatePercentileBase):
|
|
230
|
-
name = "date_per"
|
|
231
|
-
alias = "date_per_method1"
|
|
235
|
+
name: str = "date_per"
|
|
236
|
+
alias: Optional[str] = "date_per_method1"
|
|
232
237
|
|
|
233
|
-
zero_month: Optional[int]
|
|
234
|
-
zero_year: Optional[int]
|
|
235
|
-
zero_bounds: Optional[List[float]]
|
|
238
|
+
zero_month: Optional[int] = None
|
|
239
|
+
zero_year: Optional[int] = None
|
|
240
|
+
zero_bounds: Optional[List[float]] = None
|
|
236
241
|
step: int = 30
|
|
237
242
|
|
|
238
243
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -247,12 +252,25 @@ class DatePercentile(DatePercentileBase):
|
|
|
247
252
|
)
|
|
248
253
|
return res
|
|
249
254
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
255
|
+
# Check Pydantic version
|
|
256
|
+
if get_pydantic_version() >= 2:
|
|
257
|
+
# Use @field_validator for Pydantic 2.x
|
|
258
|
+
from pydantic import field_validator
|
|
259
|
+
|
|
260
|
+
@field_validator('zero_bounds', mode='before')
|
|
261
|
+
def parse_zero_bounds(cls, value):
|
|
262
|
+
if isinstance(value, str):
|
|
263
|
+
return json.loads(value)
|
|
264
|
+
return value
|
|
265
|
+
else:
|
|
266
|
+
# Use @validator for Pydantic 1.x
|
|
267
|
+
from pydantic import validator
|
|
268
|
+
|
|
269
|
+
@validator('zero_bounds', pre=True)
|
|
270
|
+
def parse_zero_bounds(cls, value):
|
|
271
|
+
if isinstance(value, str):
|
|
272
|
+
return json.loads(value)
|
|
253
273
|
return value
|
|
254
|
-
elif isinstance(value, str):
|
|
255
|
-
return json.loads(value)
|
|
256
274
|
|
|
257
275
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
258
276
|
months = date_col.dt.month
|
|
@@ -265,7 +283,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
265
283
|
|
|
266
284
|
|
|
267
285
|
class DatePercentileMethod2(DatePercentileBase):
|
|
268
|
-
name = "date_per_method2"
|
|
286
|
+
name: str = "date_per_method2"
|
|
269
287
|
|
|
270
288
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
271
289
|
pass
|
upgini/autofe/groupby.py
CHANGED
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
is_grouping = True
|
|
12
|
-
is_distribution_dependent = True
|
|
10
|
+
is_vectorizable: bool = True
|
|
11
|
+
is_grouping: bool = True
|
|
12
|
+
is_distribution_dependent: bool = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name = "GroupByThenMedian"
|
|
28
|
-
pandas_agg = "median"
|
|
29
|
-
is_distribution_dependent = True
|
|
27
|
+
name: str = "GroupByThenMedian"
|
|
28
|
+
pandas_agg: str = "median"
|
|
29
|
+
is_distribution_dependent: bool = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name = "GroupByThenRank"
|
|
34
|
-
is_vectorizable = True
|
|
35
|
-
is_grouping = True
|
|
36
|
-
output_type = "float"
|
|
37
|
-
is_distribution_dependent = True
|
|
33
|
+
name: str = "GroupByThenRank"
|
|
34
|
+
is_vectorizable: bool = True
|
|
35
|
+
is_grouping: bool = True
|
|
36
|
+
output_type: Optional[str] = "float"
|
|
37
|
+
is_distribution_dependent: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name = "GroupByThenNUnique"
|
|
53
|
-
is_vectorizable = True
|
|
54
|
-
is_grouping = True
|
|
55
|
-
output_type = "int"
|
|
56
|
-
is_distribution_dependent = True
|
|
57
|
-
input_type = "discrete"
|
|
52
|
+
name: str = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
is_grouping: bool = True
|
|
55
|
+
output_type: Optional[str] = "int"
|
|
56
|
+
is_distribution_dependent: bool = True
|
|
57
|
+
input_type: Optional[str] = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name = "GroupByThenFreq"
|
|
73
|
-
is_grouping = True
|
|
74
|
-
output_type = "float"
|
|
75
|
-
is_distribution_dependent = True
|
|
76
|
-
input_type = "discrete"
|
|
72
|
+
name: str = "GroupByThenFreq"
|
|
73
|
+
is_grouping: bool = True
|
|
74
|
+
output_type: Optional[str] = "float"
|
|
75
|
+
is_distribution_dependent: bool = True
|
|
76
|
+
input_type: Optional[str] = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
upgini/autofe/operand.py
CHANGED
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str] = None
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str] = None
|
|
16
|
+
output_type: Optional[str] = None
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]] = None
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|
upgini/autofe/unary.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
1
2
|
import numpy as np
|
|
2
3
|
import pandas as pd
|
|
3
|
-
from sklearn.preprocessing import Normalizer
|
|
4
4
|
|
|
5
5
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Abs(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "abs"
|
|
10
|
-
is_unary = True
|
|
11
|
-
is_vectorizable = True
|
|
12
|
-
group_index = 0
|
|
9
|
+
name: str = "abs"
|
|
10
|
+
is_unary: bool = True
|
|
11
|
+
is_vectorizable: bool = True
|
|
12
|
+
group_index: int = 0
|
|
13
13
|
|
|
14
14
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
15
15
|
return data.abs()
|
|
@@ -19,11 +19,11 @@ class Abs(PandasOperand, VectorizableMixin):
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Log(PandasOperand, VectorizableMixin):
|
|
22
|
-
name = "log"
|
|
23
|
-
is_unary = True
|
|
24
|
-
is_vectorizable = True
|
|
25
|
-
output_type = "float"
|
|
26
|
-
group_index = 0
|
|
22
|
+
name: str = "log"
|
|
23
|
+
is_unary: bool = True
|
|
24
|
+
is_vectorizable: bool = True
|
|
25
|
+
output_type: Optional[str] = "float"
|
|
26
|
+
group_index: int = 0
|
|
27
27
|
|
|
28
28
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
29
29
|
return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
|
|
@@ -33,11 +33,11 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
class Sqrt(PandasOperand, VectorizableMixin):
|
|
36
|
-
name = "sqrt"
|
|
37
|
-
is_unary = True
|
|
38
|
-
is_vectorizable = True
|
|
39
|
-
output_type = "float"
|
|
40
|
-
group_index = 0
|
|
36
|
+
name: str = "sqrt"
|
|
37
|
+
is_unary: bool = True
|
|
38
|
+
is_vectorizable: bool = True
|
|
39
|
+
output_type: Optional[str] = "float"
|
|
40
|
+
group_index: int = 0
|
|
41
41
|
|
|
42
42
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
43
43
|
return self._round_value(np.sqrt(np.abs(data)))
|
|
@@ -47,10 +47,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
47
47
|
|
|
48
48
|
|
|
49
49
|
class Square(PandasOperand, VectorizableMixin):
|
|
50
|
-
name = "square"
|
|
51
|
-
is_unary = True
|
|
52
|
-
is_vectorizable = True
|
|
53
|
-
group_index = 0
|
|
50
|
+
name: str = "square"
|
|
51
|
+
is_unary: bool = True
|
|
52
|
+
is_vectorizable: bool = True
|
|
53
|
+
group_index: int = 0
|
|
54
54
|
|
|
55
55
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
56
56
|
return np.square(data)
|
|
@@ -60,11 +60,11 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
60
60
|
|
|
61
61
|
|
|
62
62
|
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
63
|
-
name = "sigmoid"
|
|
64
|
-
is_unary = True
|
|
65
|
-
is_vectorizable = True
|
|
66
|
-
output_type = "float"
|
|
67
|
-
group_index = 0
|
|
63
|
+
name: str = "sigmoid"
|
|
64
|
+
is_unary: bool = True
|
|
65
|
+
is_vectorizable: bool = True
|
|
66
|
+
output_type: Optional[str] = "float"
|
|
67
|
+
group_index: int = 0
|
|
68
68
|
|
|
69
69
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
70
70
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
@@ -74,12 +74,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
74
74
|
|
|
75
75
|
|
|
76
76
|
class Floor(PandasOperand, VectorizableMixin):
|
|
77
|
-
name = "floor"
|
|
78
|
-
is_unary = True
|
|
79
|
-
is_vectorizable = True
|
|
80
|
-
output_type = "int"
|
|
81
|
-
input_type = "continuous"
|
|
82
|
-
group_index = 0
|
|
77
|
+
name: str = "floor"
|
|
78
|
+
is_unary: bool = True
|
|
79
|
+
is_vectorizable: bool = True
|
|
80
|
+
output_type: Optional[str] = "int"
|
|
81
|
+
input_type: Optional[str] = "continuous"
|
|
82
|
+
group_index: int = 0
|
|
83
83
|
|
|
84
84
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
85
85
|
return np.floor(data)
|
|
@@ -89,11 +89,11 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
class Residual(PandasOperand, VectorizableMixin):
|
|
92
|
-
name = "residual"
|
|
93
|
-
is_unary = True
|
|
94
|
-
is_vectorizable = True
|
|
95
|
-
input_type = "continuous"
|
|
96
|
-
group_index = 0
|
|
92
|
+
name: str = "residual"
|
|
93
|
+
is_unary: bool = True
|
|
94
|
+
is_vectorizable: bool = True
|
|
95
|
+
input_type: Optional[str] = "continuous"
|
|
96
|
+
group_index: int = 0
|
|
97
97
|
|
|
98
98
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
99
99
|
return data - np.floor(data)
|
|
@@ -103,11 +103,11 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
103
103
|
|
|
104
104
|
|
|
105
105
|
class Freq(PandasOperand):
|
|
106
|
-
name = "freq"
|
|
107
|
-
is_unary = True
|
|
108
|
-
output_type = "float"
|
|
109
|
-
is_distribution_dependent = True
|
|
110
|
-
input_type = "discrete"
|
|
106
|
+
name: str = "freq"
|
|
107
|
+
is_unary: bool = True
|
|
108
|
+
output_type: Optional[str] = "float"
|
|
109
|
+
is_distribution_dependent: bool = True
|
|
110
|
+
input_type: Optional[str] = "discrete"
|
|
111
111
|
|
|
112
112
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
113
113
|
value_counts = data.value_counts(normalize=True)
|
|
@@ -115,23 +115,38 @@ class Freq(PandasOperand):
|
|
|
115
115
|
|
|
116
116
|
|
|
117
117
|
class Norm(PandasOperand):
|
|
118
|
-
name = "norm"
|
|
119
|
-
is_unary = True
|
|
120
|
-
output_type = "float"
|
|
118
|
+
name: str = "norm"
|
|
119
|
+
is_unary: bool = True
|
|
120
|
+
output_type: Optional[str] = "float"
|
|
121
|
+
norm: Optional[float] = None
|
|
121
122
|
|
|
122
123
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
123
124
|
data_dropna = data.dropna()
|
|
124
125
|
if data_dropna.empty:
|
|
125
126
|
return data
|
|
126
127
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
128
|
+
if self.norm is not None:
|
|
129
|
+
normalized_data = data / self.norm
|
|
130
|
+
else:
|
|
131
|
+
self.norm = np.sqrt(np.sum(data * data))
|
|
132
|
+
normalized_data = data / self.norm
|
|
133
|
+
|
|
130
134
|
return normalized_data
|
|
131
135
|
|
|
136
|
+
def set_params(self, params: Dict[str, str]):
|
|
137
|
+
super().set_params(params)
|
|
138
|
+
if "norm" in params:
|
|
139
|
+
self.norm = params["norm"]
|
|
140
|
+
return self
|
|
141
|
+
|
|
142
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
143
|
+
res = super().get_params()
|
|
144
|
+
res["norm"] = self.norm
|
|
145
|
+
return res
|
|
146
|
+
|
|
132
147
|
|
|
133
148
|
class Embeddings(PandasOperand):
|
|
134
|
-
name = "emb"
|
|
135
|
-
is_unary = True
|
|
136
|
-
input_type = "string"
|
|
137
|
-
output_type = "vector"
|
|
149
|
+
name: str = "emb"
|
|
150
|
+
is_unary: bool = True
|
|
151
|
+
input_type: Optional[str] = "string"
|
|
152
|
+
output_type: Optional[str] = "vector"
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "mean"
|
|
10
|
-
output_type = "float"
|
|
11
|
-
is_vector = True
|
|
12
|
-
group_index = 0
|
|
9
|
+
name: str = "mean"
|
|
10
|
+
output_type: Optional[str] = "float"
|
|
11
|
+
is_vector: bool = True
|
|
12
|
+
group_index: int = 0
|
|
13
13
|
|
|
14
14
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
15
15
|
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class Sum(PandasOperand, VectorizableMixin):
|
|
19
|
-
name = "sum"
|
|
20
|
-
is_vector = True
|
|
21
|
-
group_index = 0
|
|
19
|
+
name: str = "sum"
|
|
20
|
+
is_vector: bool = True
|
|
21
|
+
group_index: int = 0
|
|
22
22
|
|
|
23
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
upgini/dataset.py
CHANGED
|
@@ -18,6 +18,7 @@ from pandas.api.types import (
|
|
|
18
18
|
from upgini.errors import ValidationError
|
|
19
19
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
20
20
|
from upgini.metadata import (
|
|
21
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
21
22
|
EVAL_SET_INDEX,
|
|
22
23
|
SYSTEM_RECORD_ID,
|
|
23
24
|
TARGET,
|
|
@@ -157,7 +158,11 @@ class Dataset: # (pd.DataFrame):
|
|
|
157
158
|
raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
|
|
158
159
|
|
|
159
160
|
def __validate_max_row_count(self):
|
|
160
|
-
if
|
|
161
|
+
if ENTITY_SYSTEM_RECORD_ID in self.data.columns:
|
|
162
|
+
rows_count = self.data[ENTITY_SYSTEM_RECORD_ID].nunique()
|
|
163
|
+
else:
|
|
164
|
+
rows_count = len(self.data)
|
|
165
|
+
if rows_count > self.MAX_ROWS:
|
|
161
166
|
raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
|
|
162
167
|
|
|
163
168
|
def __target_value(self) -> pd.Series:
|
|
@@ -199,14 +204,14 @@ class Dataset: # (pd.DataFrame):
|
|
|
199
204
|
elif self.task_type == ModelTaskType.REGRESSION:
|
|
200
205
|
if not is_float_dtype(target):
|
|
201
206
|
try:
|
|
202
|
-
self.data[target_column] = self.data[target_column].astype("
|
|
207
|
+
self.data[target_column] = self.data[target_column].astype("float64")
|
|
203
208
|
except ValueError:
|
|
204
209
|
self.logger.exception("Failed to cast target to float for regression task type")
|
|
205
210
|
raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
|
|
206
211
|
elif self.task_type == ModelTaskType.TIMESERIES:
|
|
207
212
|
if not is_float_dtype(target):
|
|
208
213
|
try:
|
|
209
|
-
self.data[target_column] = self.data[target_column].astype("
|
|
214
|
+
self.data[target_column] = self.data[target_column].astype("float64")
|
|
210
215
|
except ValueError:
|
|
211
216
|
self.logger.exception("Failed to cast target to float for timeseries task type")
|
|
212
217
|
raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
|
upgini/features_enricher.py
CHANGED
|
@@ -23,7 +23,6 @@ from pandas.api.types import (
|
|
|
23
23
|
is_datetime64_any_dtype,
|
|
24
24
|
is_numeric_dtype,
|
|
25
25
|
is_object_dtype,
|
|
26
|
-
is_period_dtype,
|
|
27
26
|
is_string_dtype,
|
|
28
27
|
)
|
|
29
28
|
from scipy.stats import ks_2samp
|
|
@@ -1408,7 +1407,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1408
1407
|
# TODO maybe there is no more need for these convertions
|
|
1409
1408
|
# Remove datetime features
|
|
1410
1409
|
datetime_features = [
|
|
1411
|
-
f
|
|
1410
|
+
f
|
|
1411
|
+
for f in fitting_X.columns
|
|
1412
|
+
if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
|
|
1412
1413
|
]
|
|
1413
1414
|
if len(datetime_features) > 0:
|
|
1414
1415
|
self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
|
|
@@ -2041,7 +2042,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2041
2042
|
|
|
2042
2043
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
2043
2044
|
df[columns_for_system_record_id], index=False
|
|
2044
|
-
).astype("
|
|
2045
|
+
).astype("float64")
|
|
2045
2046
|
|
|
2046
2047
|
# Explode multiple search keys
|
|
2047
2048
|
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
|
@@ -2107,7 +2108,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2107
2108
|
# search keys might be changed after explode
|
|
2108
2109
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2109
2110
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2110
|
-
"
|
|
2111
|
+
"float64"
|
|
2111
2112
|
)
|
|
2112
2113
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2113
2114
|
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
@@ -2667,6 +2668,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2667
2668
|
|
|
2668
2669
|
autofe_description = self.get_autofe_features_description()
|
|
2669
2670
|
if autofe_description is not None:
|
|
2671
|
+
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
|
2670
2672
|
display_html_dataframe(autofe_description, autofe_description, "*Description of AutoFE feature names")
|
|
2671
2673
|
|
|
2672
2674
|
if self._has_paid_features(exclude_features_sources):
|
upgini/http.py
CHANGED
|
@@ -39,18 +39,6 @@ from upgini.metadata import (
|
|
|
39
39
|
from upgini.resource_bundle import bundle
|
|
40
40
|
from upgini.utils.track_info import get_track_metrics
|
|
41
41
|
|
|
42
|
-
# try:
|
|
43
|
-
# from importlib.metadata import version # type: ignore
|
|
44
|
-
|
|
45
|
-
# __version__ = version("upgini")
|
|
46
|
-
# except ImportError:
|
|
47
|
-
# try:
|
|
48
|
-
# from importlib_metadata import version # type: ignore
|
|
49
|
-
|
|
50
|
-
# __version__ = version("upgini")
|
|
51
|
-
# except ImportError:
|
|
52
|
-
# __version__ = "Upgini wasn't installed"
|
|
53
|
-
|
|
54
42
|
UPGINI_URL: str = "UPGINI_URL"
|
|
55
43
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
56
44
|
DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
|
|
@@ -471,7 +459,11 @@ class _RestClient:
|
|
|
471
459
|
dumps(track_metrics).encode(),
|
|
472
460
|
"application/json",
|
|
473
461
|
),
|
|
474
|
-
"metrics": (
|
|
462
|
+
"metrics": (
|
|
463
|
+
"metrics.json",
|
|
464
|
+
metrics.json(exclude_none=True).encode(),
|
|
465
|
+
"application/json",
|
|
466
|
+
),
|
|
475
467
|
"file": (metadata_with_md5.name, file, "application/octet-stream"),
|
|
476
468
|
}
|
|
477
469
|
if search_customization is not None:
|
|
@@ -555,7 +547,11 @@ class _RestClient:
|
|
|
555
547
|
dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
|
|
556
548
|
"application/json",
|
|
557
549
|
),
|
|
558
|
-
"metrics": (
|
|
550
|
+
"metrics": (
|
|
551
|
+
"metrics.json",
|
|
552
|
+
metrics.json(exclude_none=True).encode(),
|
|
553
|
+
"application/json",
|
|
554
|
+
),
|
|
559
555
|
"file": (metadata_with_md5.name, file, "application/octet-stream"),
|
|
560
556
|
}
|
|
561
557
|
if search_customization is not None:
|
|
@@ -651,7 +647,11 @@ class _RestClient:
|
|
|
651
647
|
with open(file_path, "rb") as file:
|
|
652
648
|
files = {
|
|
653
649
|
"file": (metadata.name, file, "application/octet-stream"),
|
|
654
|
-
"metadata": (
|
|
650
|
+
"metadata": (
|
|
651
|
+
"metadata.json",
|
|
652
|
+
metadata.json(exclude_none=True).encode(),
|
|
653
|
+
"application/json",
|
|
654
|
+
),
|
|
655
655
|
}
|
|
656
656
|
|
|
657
657
|
return self._send_post_file_req_v2(api_path, files)
|
upgini/lazy_import.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import importlib
|
|
2
|
+
import importlib.util
|
|
3
|
+
import importlib.machinery
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
class LazyImport:
|
|
@@ -10,7 +12,18 @@ class LazyImport:
|
|
|
10
12
|
|
|
11
13
|
def _load(self):
|
|
12
14
|
if self._module is None:
|
|
13
|
-
|
|
15
|
+
# Load module and save link to it
|
|
16
|
+
spec = importlib.util.find_spec(self.module_name)
|
|
17
|
+
if spec is None:
|
|
18
|
+
raise ImportError(f"Module {self.module_name} not found")
|
|
19
|
+
|
|
20
|
+
# Create module
|
|
21
|
+
self._module = importlib.util.module_from_spec(spec)
|
|
22
|
+
|
|
23
|
+
# Execute module
|
|
24
|
+
spec.loader.exec_module(self._module)
|
|
25
|
+
|
|
26
|
+
# Get class from module
|
|
14
27
|
self._class = getattr(self._module, self.class_name)
|
|
15
28
|
|
|
16
29
|
def __call__(self, *args, **kwargs):
|
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Dict, List, Optional, Set, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -172,23 +172,23 @@ class FileMetricsInterval(BaseModel):
|
|
|
172
172
|
date_cut: float
|
|
173
173
|
count: float
|
|
174
174
|
valid_count: float
|
|
175
|
-
avg_target: Optional[float] # not for multiclass
|
|
176
|
-
avg_score_etalon: Optional[float]
|
|
175
|
+
avg_target: Optional[float] = None # not for multiclass
|
|
176
|
+
avg_score_etalon: Optional[float] = None
|
|
177
177
|
|
|
178
178
|
|
|
179
179
|
class FileMetrics(BaseModel):
|
|
180
180
|
# etalon metadata
|
|
181
|
-
task_type: Optional[ModelTaskType]
|
|
182
|
-
label: Optional[ModelLabelType]
|
|
183
|
-
count: Optional[int]
|
|
184
|
-
valid_count: Optional[int]
|
|
185
|
-
valid_rate: Optional[float]
|
|
186
|
-
avg_target: Optional[float]
|
|
187
|
-
metrics_binary_etalon: Optional[BinaryTask]
|
|
188
|
-
metrics_regression_etalon: Optional[RegressionTask]
|
|
189
|
-
metrics_multiclass_etalon: Optional[MulticlassTask]
|
|
190
|
-
cuts: Optional[List[float]]
|
|
191
|
-
interval: Optional[List[FileMetricsInterval]]
|
|
181
|
+
task_type: Optional[ModelTaskType] = None
|
|
182
|
+
label: Optional[ModelLabelType] = None
|
|
183
|
+
count: Optional[int] = None
|
|
184
|
+
valid_count: Optional[int] = None
|
|
185
|
+
valid_rate: Optional[float] = None
|
|
186
|
+
avg_target: Optional[float] = None
|
|
187
|
+
metrics_binary_etalon: Optional[BinaryTask] = None
|
|
188
|
+
metrics_regression_etalon: Optional[RegressionTask] = None
|
|
189
|
+
metrics_multiclass_etalon: Optional[MulticlassTask] = None
|
|
190
|
+
cuts: Optional[List[float]] = None
|
|
191
|
+
interval: Optional[List[FileMetricsInterval]] = None
|
|
192
192
|
|
|
193
193
|
|
|
194
194
|
class NumericInterval(BaseModel):
|
|
@@ -202,25 +202,25 @@ class FileColumnMetadata(BaseModel):
|
|
|
202
202
|
dataType: DataType
|
|
203
203
|
meaningType: FileColumnMeaningType
|
|
204
204
|
minMaxValues: Optional[NumericInterval] = None
|
|
205
|
-
originalName: Optional[str]
|
|
205
|
+
originalName: Optional[str] = None
|
|
206
206
|
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
207
207
|
isUnnest: bool = False
|
|
208
208
|
# list of original etalon key column names like msisdn1, msisdn2
|
|
209
|
-
unnestKeyNames: Optional[List[str]]
|
|
209
|
+
unnestKeyNames: Optional[List[str]] = None
|
|
210
210
|
|
|
211
211
|
|
|
212
212
|
class FileMetadata(BaseModel):
|
|
213
213
|
name: str
|
|
214
|
-
description: Optional[str]
|
|
214
|
+
description: Optional[str] = None
|
|
215
215
|
columns: List[FileColumnMetadata]
|
|
216
216
|
searchKeys: List[List[str]]
|
|
217
|
-
excludeFeaturesSources: Optional[List[str]]
|
|
218
|
-
hierarchicalGroupKeys: Optional[List[str]]
|
|
219
|
-
hierarchicalSubgroupKeys: Optional[List[str]]
|
|
220
|
-
taskType: Optional[ModelTaskType]
|
|
221
|
-
rowsCount: Optional[int]
|
|
222
|
-
checksumMD5: Optional[str]
|
|
223
|
-
digest: Optional[str]
|
|
217
|
+
excludeFeaturesSources: Optional[List[str]] = None
|
|
218
|
+
hierarchicalGroupKeys: Optional[List[str]] = None
|
|
219
|
+
hierarchicalSubgroupKeys: Optional[List[str]] = None
|
|
220
|
+
taskType: Optional[ModelTaskType] = None
|
|
221
|
+
rowsCount: Optional[int] = None
|
|
222
|
+
checksumMD5: Optional[str] = None
|
|
223
|
+
digest: Optional[str] = None
|
|
224
224
|
|
|
225
225
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
|
226
226
|
for c in self.columns:
|
|
@@ -244,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
|
|
|
244
244
|
source: str
|
|
245
245
|
hit_rate: float
|
|
246
246
|
shap_value: float
|
|
247
|
-
commercial_schema: Optional[str]
|
|
248
|
-
data_provider: Optional[str]
|
|
249
|
-
data_providers: Optional[List[str]]
|
|
250
|
-
data_provider_link: Optional[str]
|
|
251
|
-
data_provider_links: Optional[List[str]]
|
|
252
|
-
data_source: Optional[str]
|
|
253
|
-
data_sources: Optional[List[str]]
|
|
254
|
-
data_source_link: Optional[str]
|
|
255
|
-
data_source_links: Optional[List[str]]
|
|
256
|
-
doc_link: Optional[str]
|
|
257
|
-
update_frequency: Optional[str]
|
|
247
|
+
commercial_schema: Optional[str] = None
|
|
248
|
+
data_provider: Optional[str] = None
|
|
249
|
+
data_providers: Optional[List[str]] = None
|
|
250
|
+
data_provider_link: Optional[str] = None
|
|
251
|
+
data_provider_links: Optional[List[str]] = None
|
|
252
|
+
data_source: Optional[str] = None
|
|
253
|
+
data_sources: Optional[List[str]] = None
|
|
254
|
+
data_source_link: Optional[str] = None
|
|
255
|
+
data_source_links: Optional[List[str]] = None
|
|
256
|
+
doc_link: Optional[str] = None
|
|
257
|
+
update_frequency: Optional[str] = None
|
|
258
258
|
|
|
259
259
|
|
|
260
260
|
class HitRateMetrics(BaseModel):
|
|
@@ -274,48 +274,48 @@ class ModelEvalSet(BaseModel):
|
|
|
274
274
|
class BaseColumnMetadata(BaseModel):
|
|
275
275
|
original_name: str
|
|
276
276
|
hashed_name: str
|
|
277
|
-
ads_definition_id: Optional[str]
|
|
277
|
+
ads_definition_id: Optional[str] = None
|
|
278
278
|
is_augmented: bool
|
|
279
279
|
|
|
280
280
|
|
|
281
281
|
class GeneratedFeatureMetadata(BaseModel):
|
|
282
|
-
alias: Optional[str]
|
|
282
|
+
alias: Optional[str] = None
|
|
283
283
|
formula: str
|
|
284
284
|
display_index: str
|
|
285
285
|
base_columns: List[BaseColumnMetadata]
|
|
286
|
-
operator_params: Optional[Dict[str, str]]
|
|
286
|
+
operator_params: Optional[Dict[str, str]] = None
|
|
287
287
|
|
|
288
288
|
|
|
289
289
|
class ProviderTaskMetadataV2(BaseModel):
|
|
290
290
|
features: List[FeaturesMetadataV2]
|
|
291
|
-
hit_rate_metrics: Optional[HitRateMetrics]
|
|
292
|
-
eval_set_metrics: Optional[List[ModelEvalSet]]
|
|
293
|
-
zero_hit_rate_search_keys: Optional[List[str]]
|
|
294
|
-
features_used_for_embeddings: Optional[List[str]]
|
|
295
|
-
shuffle_kfold: Optional[bool]
|
|
296
|
-
generated_features: Optional[List[GeneratedFeatureMetadata]]
|
|
291
|
+
hit_rate_metrics: Optional[HitRateMetrics] = None
|
|
292
|
+
eval_set_metrics: Optional[List[ModelEvalSet]] = None
|
|
293
|
+
zero_hit_rate_search_keys: Optional[List[str]] = None
|
|
294
|
+
features_used_for_embeddings: Optional[List[str]] = None
|
|
295
|
+
shuffle_kfold: Optional[bool] = None
|
|
296
|
+
generated_features: Optional[List[GeneratedFeatureMetadata]] = None
|
|
297
297
|
|
|
298
298
|
|
|
299
299
|
class FeaturesFilter(BaseModel):
|
|
300
|
-
minImportance: Optional[float]
|
|
301
|
-
maxPSI: Optional[float]
|
|
302
|
-
maxCount: Optional[int]
|
|
303
|
-
selectedFeatures: Optional[List[str]]
|
|
300
|
+
minImportance: Optional[float] = None
|
|
301
|
+
maxPSI: Optional[float] = None
|
|
302
|
+
maxCount: Optional[int] = None
|
|
303
|
+
selectedFeatures: Optional[List[str]] = None
|
|
304
304
|
|
|
305
305
|
|
|
306
306
|
class RuntimeParameters(BaseModel):
|
|
307
|
-
properties: Dict[str,
|
|
307
|
+
properties: Dict[str, Any] = {}
|
|
308
308
|
|
|
309
309
|
|
|
310
310
|
class SearchCustomization(BaseModel):
|
|
311
|
-
featuresFilter: Optional[FeaturesFilter]
|
|
312
|
-
extractFeatures: Optional[bool]
|
|
313
|
-
accurateModel: Optional[bool]
|
|
314
|
-
importanceThreshold: Optional[float]
|
|
315
|
-
maxFeatures: Optional[int]
|
|
316
|
-
returnScores: Optional[bool]
|
|
317
|
-
runtimeParameters: Optional[RuntimeParameters]
|
|
318
|
-
metricsCalculation: Optional[bool]
|
|
311
|
+
featuresFilter: Optional[FeaturesFilter] = None
|
|
312
|
+
extractFeatures: Optional[bool] = None
|
|
313
|
+
accurateModel: Optional[bool] = None
|
|
314
|
+
importanceThreshold: Optional[float] = None
|
|
315
|
+
maxFeatures: Optional[int] = None
|
|
316
|
+
returnScores: Optional[bool] = None
|
|
317
|
+
runtimeParameters: Optional[RuntimeParameters] = None
|
|
318
|
+
metricsCalculation: Optional[bool] = None
|
|
319
319
|
|
|
320
320
|
def __repr__(self):
|
|
321
321
|
return (
|
|
@@ -10,7 +10,6 @@ from pandas.api.types import (
|
|
|
10
10
|
is_float_dtype,
|
|
11
11
|
is_numeric_dtype,
|
|
12
12
|
is_object_dtype,
|
|
13
|
-
is_period_dtype,
|
|
14
13
|
is_string_dtype,
|
|
15
14
|
)
|
|
16
15
|
|
|
@@ -135,7 +134,7 @@ class Normalizer:
|
|
|
135
134
|
|
|
136
135
|
removed_features = []
|
|
137
136
|
for f in features:
|
|
138
|
-
if is_datetime(df[f]) or
|
|
137
|
+
if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
|
|
139
138
|
removed_features.append(f)
|
|
140
139
|
df.drop(columns=f, inplace=True)
|
|
141
140
|
|
upgini/search_task.py
CHANGED
|
@@ -3,6 +3,7 @@ import tempfile
|
|
|
3
3
|
import time
|
|
4
4
|
from functools import lru_cache
|
|
5
5
|
from typing import Dict, List, Optional
|
|
6
|
+
import uuid
|
|
6
7
|
|
|
7
8
|
import pandas as pd
|
|
8
9
|
|
|
@@ -97,10 +98,7 @@ class SearchTask:
|
|
|
97
98
|
time.sleep(self.POLLING_DELAY_SECONDS)
|
|
98
99
|
except KeyboardInterrupt as e:
|
|
99
100
|
if not check_fit:
|
|
100
|
-
|
|
101
|
-
self.rest_client.stop_search_task_v2(trace_id, search_task_id)
|
|
102
|
-
self.logger.warning(f"Search {search_task_id} stopped by user")
|
|
103
|
-
print(bundle.get("search_stopped"))
|
|
101
|
+
self._stop(trace_id)
|
|
104
102
|
raise e
|
|
105
103
|
print()
|
|
106
104
|
|
|
@@ -133,6 +131,14 @@ class SearchTask:
|
|
|
133
131
|
|
|
134
132
|
return self
|
|
135
133
|
|
|
134
|
+
def _stop(self, trace_id: Optional[str] = None):
|
|
135
|
+
trace_id = trace_id or uuid.uuid4()
|
|
136
|
+
search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
|
|
137
|
+
print(bundle.get("search_stopping"))
|
|
138
|
+
self.rest_client.stop_search_task_v2(trace_id, search_task_id)
|
|
139
|
+
self.logger.warning(f"Search {search_task_id} stopped by user")
|
|
140
|
+
print(bundle.get("search_stopped"))
|
|
141
|
+
|
|
136
142
|
def get_all_features_metadata_v2(self) -> Optional[List[FeaturesMetadataV2]]:
|
|
137
143
|
if self.provider_metadata_v2 is None:
|
|
138
144
|
return None
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import is_numeric_dtype
|
|
9
|
+
from pandas.api.types import is_numeric_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
12
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
@@ -84,7 +84,7 @@ class DateTimeSearchKeyConverter:
|
|
|
84
84
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
85
85
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
86
86
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
87
|
-
elif
|
|
87
|
+
elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
|
|
88
88
|
df[self.date_column] = df[self.date_column].dt.to_timestamp()
|
|
89
89
|
elif is_numeric_dtype(df[self.date_column]):
|
|
90
90
|
# 315532801 - 2524608001 - seconds
|
|
@@ -207,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
|
207
207
|
def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
|
|
208
208
|
df = df.copy()
|
|
209
209
|
seconds = "datetime_seconds"
|
|
210
|
-
if
|
|
210
|
+
if isinstance(df[date_col].dtype, pd.PeriodDtype):
|
|
211
211
|
df[date_col] = df[date_col].dt.to_timestamp()
|
|
212
212
|
else:
|
|
213
213
|
df[date_col] = pd.to_datetime(df[date_col])
|
|
@@ -275,7 +275,7 @@ def validate_dates_distribution(
|
|
|
275
275
|
if col in search_keys:
|
|
276
276
|
continue
|
|
277
277
|
try:
|
|
278
|
-
if
|
|
278
|
+
if isinstance(X[col].dtype, pd.PeriodDtype):
|
|
279
279
|
pass
|
|
280
280
|
elif pd.__version__ >= "2.0.0":
|
|
281
281
|
# Format mixed to avoid massive warnings
|
|
@@ -290,7 +290,7 @@ def validate_dates_distribution(
|
|
|
290
290
|
if maybe_date_col is None:
|
|
291
291
|
return
|
|
292
292
|
|
|
293
|
-
if
|
|
293
|
+
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
294
294
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
295
295
|
elif pd.__version__ >= "2.0.0":
|
|
296
296
|
dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
|
upgini/utils/phone_utils.py
CHANGED
|
@@ -1,12 +1,8 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
|
+
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import
|
|
5
|
-
is_float_dtype,
|
|
6
|
-
is_int64_dtype,
|
|
7
|
-
is_object_dtype,
|
|
8
|
-
is_string_dtype,
|
|
9
|
-
)
|
|
5
|
+
from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
|
|
10
6
|
|
|
11
7
|
from upgini.errors import ValidationError
|
|
12
8
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
@@ -63,7 +59,9 @@ class PhoneSearchKeyConverter:
|
|
|
63
59
|
convert_func = self.phone_str_to_int_safe
|
|
64
60
|
elif is_float_dtype(df[self.phone_column]):
|
|
65
61
|
convert_func = self.phone_float_to_int_safe
|
|
66
|
-
elif
|
|
62
|
+
elif df[self.phone_column].dtype == np.int64 or isinstance(
|
|
63
|
+
df[self.phone_column].dtype, pd.Int64Dtype
|
|
64
|
+
):
|
|
67
65
|
convert_func = self.phone_int_to_int_safe
|
|
68
66
|
else:
|
|
69
67
|
raise ValidationError(
|
|
@@ -25,7 +25,7 @@ class PostalCodeSearchKeyConverter:
|
|
|
25
25
|
if is_string_dtype(df[self.postal_code_column]) or is_object_dtype(df[self.postal_code_column]):
|
|
26
26
|
try:
|
|
27
27
|
df[self.postal_code_column] = (
|
|
28
|
-
df[self.postal_code_column].astype("string").astype("
|
|
28
|
+
df[self.postal_code_column].astype("string").astype("float64").astype("Int64").astype("string")
|
|
29
29
|
)
|
|
30
30
|
except Exception:
|
|
31
31
|
pass
|
upgini/utils/target_utils.py
CHANGED
|
@@ -194,4 +194,7 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
|
194
194
|
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
195
195
|
|
|
196
196
|
# Calculate the PSI
|
|
197
|
-
|
|
197
|
+
try:
|
|
198
|
+
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
199
|
+
except Exception:
|
|
200
|
+
return np.nan
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy
|
|
32
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic<
|
|
34
|
+
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -1,33 +1,33 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=Mlm4Gvmb_6yQxwUbv2Ksc-BJFXLPg9H1Vt2iV7wXrA4,22
|
|
2
2
|
upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
8
|
-
upgini/lazy_import.py,sha256=
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=twH4qdl91iHZF_AraLk0aIbRDw61S_DYtCWCZ34Yjjg,188077
|
|
7
|
+
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
|
+
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
+
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
10
|
upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
|
|
11
|
-
upgini/search_task.py,sha256=
|
|
11
|
+
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
18
|
+
upgini/autofe/binary.py,sha256=TRjEdxsfyPY5E8ksYfdKMmU6GtvALfGFPNVIG7DBhzM,7520
|
|
19
|
+
upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
|
|
20
20
|
upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
|
|
21
|
-
upgini/autofe/groupby.py,sha256=
|
|
22
|
-
upgini/autofe/operand.py,sha256=
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
21
|
+
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
22
|
+
upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
|
|
23
|
+
upgini/autofe/unary.py,sha256=VTX5BdPJUArt-H9qYfOghKw_WlUatFjZto6zu3KGjb4,4484
|
|
24
|
+
upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
33
|
upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
|
|
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
42
42
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
|
|
53
|
-
upgini/utils/phone_utils.py,sha256=
|
|
54
|
-
upgini/utils/postal_code_utils.py,sha256=
|
|
53
|
+
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
54
|
+
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.2.1.dist-info/METADATA,sha256=6kMBYOGEY3dcShtSVLc1Qo9kyL8dKwdmFXZXhM1vzFA,48228
|
|
61
|
+
upgini-1.2.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|