upgini 1.1.316__py3-none-any.whl → 1.1.316a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +72 -75
- upgini/autofe/date.py +21 -21
- upgini/autofe/feature.py +2 -2
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +4 -4
- upgini/autofe/unary.py +47 -46
- upgini/autofe/vector.py +8 -8
- upgini/features_enricher.py +3 -2
- upgini/http.py +32 -32
- upgini/lazy_import.py +14 -1
- upgini/metadata.py +57 -57
- upgini/normalizer/normalize_utils.py +1 -2
- upgini/utils/datetime_utils.py +5 -5
- upgini/utils/phone_utils.py +5 -7
- upgini/utils/target_utils.py +4 -1
- {upgini-1.1.316.dist-info → upgini-1.1.316a2.dist-info}/METADATA +3 -3
- {upgini-1.1.316.dist-info → upgini-1.1.316a2.dist-info}/RECORD +20 -20
- {upgini-1.1.316.dist-info → upgini-1.1.316a2.dist-info}/WHEEL +0 -0
- {upgini-1.1.316.dist-info → upgini-1.1.316a2.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.316"
|
|
1
|
+
__version__ = "1.1.316a2"
|
upgini/autofe/binary.py
CHANGED
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name = "min"
|
|
13
|
-
is_binary = True
|
|
14
|
-
is_symmetrical = True
|
|
15
|
-
has_symmetry_importance = True
|
|
12
|
+
name: str = "min"
|
|
13
|
+
is_binary: bool = True
|
|
14
|
+
is_symmetrical: bool = True
|
|
15
|
+
has_symmetry_importance: bool = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name = "max"
|
|
23
|
-
is_binary = True
|
|
24
|
-
is_symmetrical = True
|
|
25
|
-
has_symmetry_importance = True
|
|
22
|
+
name: str = "max"
|
|
23
|
+
is_binary: bool = True
|
|
24
|
+
is_symmetrical: bool = True
|
|
25
|
+
has_symmetry_importance: bool = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name = "+"
|
|
33
|
-
alias = "add"
|
|
34
|
-
is_binary = True
|
|
35
|
-
is_symmetrical = True
|
|
36
|
-
has_symmetry_importance = True
|
|
37
|
-
is_vectorizable = True
|
|
32
|
+
name: str = "+"
|
|
33
|
+
alias: str = "add"
|
|
34
|
+
is_binary: bool = True
|
|
35
|
+
is_symmetrical: bool = True
|
|
36
|
+
has_symmetry_importance: bool = True
|
|
37
|
+
is_vectorizable: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name = "-"
|
|
52
|
-
alias = "sub"
|
|
53
|
-
is_binary = True
|
|
54
|
-
is_symmetrical = True
|
|
55
|
-
has_symmetry_importance = True
|
|
56
|
-
is_vectorizable = True
|
|
51
|
+
name: str = "-"
|
|
52
|
+
alias: str = "sub"
|
|
53
|
+
is_binary: bool = True
|
|
54
|
+
is_symmetrical: bool = True
|
|
55
|
+
has_symmetry_importance: bool = True
|
|
56
|
+
is_vectorizable: bool = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name = "*"
|
|
71
|
-
alias = "mul"
|
|
72
|
-
is_binary = True
|
|
73
|
-
is_symmetrical = True
|
|
74
|
-
has_symmetry_importance = True
|
|
75
|
-
is_vectorizable = True
|
|
70
|
+
name: str = "*"
|
|
71
|
+
alias: str = "mul"
|
|
72
|
+
is_binary: bool = True
|
|
73
|
+
is_symmetrical: bool = True
|
|
74
|
+
has_symmetry_importance: bool = True
|
|
75
|
+
is_vectorizable: bool = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name = "/"
|
|
90
|
-
alias = "div"
|
|
91
|
-
is_binary = True
|
|
92
|
-
has_symmetry_importance = True
|
|
93
|
-
is_vectorizable = True
|
|
94
|
-
output_type = "float"
|
|
89
|
+
name: str = "/"
|
|
90
|
+
alias: str = "div"
|
|
91
|
+
is_binary: bool = True
|
|
92
|
+
has_symmetry_importance: bool = True
|
|
93
|
+
is_vectorizable: bool = True
|
|
94
|
+
output_type: Optional[str] = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name = "Combine"
|
|
109
|
-
is_binary = True
|
|
110
|
-
has_symmetry_importance = True
|
|
111
|
-
output_type = "object"
|
|
108
|
+
name: str = "Combine"
|
|
109
|
+
is_binary: bool = True
|
|
110
|
+
has_symmetry_importance: bool = True
|
|
111
|
+
output_type: Optional[str] = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name = "CombineThenFreq"
|
|
121
|
-
is_binary = True
|
|
122
|
-
is_symmetrical = True
|
|
123
|
-
has_symmetry_importance = True
|
|
124
|
-
output_type = "float"
|
|
125
|
-
is_distribution_dependent = True
|
|
126
|
-
input_type = "discrete"
|
|
120
|
+
name: str = "CombineThenFreq"
|
|
121
|
+
is_binary: bool = True
|
|
122
|
+
is_symmetrical: bool = True
|
|
123
|
+
has_symmetry_importance: bool = True
|
|
124
|
+
output_type: Optional[str] = "float"
|
|
125
|
+
is_distribution_dependent: bool = True
|
|
126
|
+
input_type: Optional[str] = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name = "dist"
|
|
137
|
-
is_binary = True
|
|
138
|
-
output_type = "float"
|
|
139
|
-
is_symmetrical = True
|
|
140
|
-
has_symmetry_importance = True
|
|
136
|
+
name: str = "dist"
|
|
137
|
+
is_binary: bool = True
|
|
138
|
+
output_type: Optional[str] = "float"
|
|
139
|
+
is_symmetrical: bool = True
|
|
140
|
+
has_symmetry_importance: bool = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
144
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
# row-wise dot product
|
|
@@ -152,17 +152,14 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
-
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
156
|
-
return np.sqrt(self.__dot(vector, vector))
|
|
157
|
-
|
|
158
155
|
|
|
159
156
|
# Left for backward compatibility
|
|
160
157
|
class Sim(Distance):
|
|
161
|
-
name = "sim"
|
|
162
|
-
is_binary = True
|
|
163
|
-
output_type = "float"
|
|
164
|
-
is_symmetrical = True
|
|
165
|
-
has_symmetry_importance = True
|
|
158
|
+
name: str = "sim"
|
|
159
|
+
is_binary: bool = True
|
|
160
|
+
output_type: Optional[str] = "float"
|
|
161
|
+
is_symmetrical: bool = True
|
|
162
|
+
has_symmetry_importance: bool = True
|
|
166
163
|
|
|
167
164
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
165
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -191,12 +188,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
191
188
|
|
|
192
189
|
|
|
193
190
|
class JaroWinklerSim1(StringSim):
|
|
194
|
-
name = "sim_jw1"
|
|
195
|
-
is_binary = True
|
|
196
|
-
input_type = "string"
|
|
197
|
-
output_type = "float"
|
|
198
|
-
is_symmetrical = True
|
|
199
|
-
has_symmetry_importance = True
|
|
191
|
+
name: str = "sim_jw1"
|
|
192
|
+
is_binary: bool = True
|
|
193
|
+
input_type: Optional[str] = "string"
|
|
194
|
+
output_type: Optional[str] = "float"
|
|
195
|
+
is_symmetrical: bool = True
|
|
196
|
+
has_symmetry_importance: bool = True
|
|
200
197
|
|
|
201
198
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
202
199
|
return value
|
|
@@ -206,12 +203,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
206
203
|
|
|
207
204
|
|
|
208
205
|
class JaroWinklerSim2(StringSim):
|
|
209
|
-
name = "sim_jw2"
|
|
210
|
-
is_binary = True
|
|
211
|
-
input_type = "string"
|
|
212
|
-
output_type = "float"
|
|
213
|
-
is_symmetrical = True
|
|
214
|
-
has_symmetry_importance = True
|
|
206
|
+
name: str = "sim_jw2"
|
|
207
|
+
is_binary: bool = True
|
|
208
|
+
input_type: Optional[str] = "string"
|
|
209
|
+
output_type: Optional[str] = "float"
|
|
210
|
+
is_symmetrical: bool = True
|
|
211
|
+
has_symmetry_importance: bool = True
|
|
215
212
|
|
|
216
213
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
217
214
|
return value[::-1] if value is not None else None
|
|
@@ -221,12 +218,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
221
218
|
|
|
222
219
|
|
|
223
220
|
class LevenshteinSim(StringSim):
|
|
224
|
-
name = "sim_lv"
|
|
225
|
-
is_binary = True
|
|
226
|
-
input_type = "string"
|
|
227
|
-
output_type = "float"
|
|
228
|
-
is_symmetrical = True
|
|
229
|
-
has_symmetry_importance = True
|
|
221
|
+
name: str = "sim_lv"
|
|
222
|
+
is_binary: bool = True
|
|
223
|
+
input_type: Optional[str] = "string"
|
|
224
|
+
output_type: Optional[str] = "float"
|
|
225
|
+
is_symmetrical: bool = True
|
|
226
|
+
has_symmetry_importance: bool = True
|
|
230
227
|
|
|
231
228
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
232
229
|
return value
|
upgini/autofe/date.py
CHANGED
|
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
7
|
-
from pydantic import BaseModel, validator
|
|
7
|
+
from pydantic import BaseModel, field_validator
|
|
8
8
|
|
|
9
9
|
from upgini.autofe.operand import PandasOperand
|
|
10
10
|
|
|
@@ -38,10 +38,10 @@ class DateDiffMixin(BaseModel):
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
41
|
-
name = "date_diff"
|
|
42
|
-
alias = "date_diff_type1"
|
|
43
|
-
is_binary = True
|
|
44
|
-
has_symmetry_importance = True
|
|
41
|
+
name: str = "date_diff"
|
|
42
|
+
alias: Optional[str] = "date_diff_type1"
|
|
43
|
+
is_binary: bool = True
|
|
44
|
+
has_symmetry_importance: bool = True
|
|
45
45
|
|
|
46
46
|
replace_negative: bool = False
|
|
47
47
|
|
|
@@ -70,9 +70,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
73
|
-
name = "date_diff_type2"
|
|
74
|
-
is_binary = True
|
|
75
|
-
has_symmetry_importance = True
|
|
73
|
+
name: str = "date_diff_type2"
|
|
74
|
+
is_binary: bool = True
|
|
75
|
+
has_symmetry_importance: bool = True
|
|
76
76
|
|
|
77
77
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
78
78
|
res = super().get_params()
|
|
@@ -104,8 +104,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
107
|
-
is_binary = True
|
|
108
|
-
has_symmetry_importance = True
|
|
107
|
+
is_binary: bool = True
|
|
108
|
+
has_symmetry_importance: bool = True
|
|
109
109
|
|
|
110
110
|
aggregation: str
|
|
111
111
|
replace_negative: bool = False
|
|
@@ -165,8 +165,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
165
165
|
|
|
166
166
|
|
|
167
167
|
class DateListDiffBounded(DateListDiff):
|
|
168
|
-
lower_bound: Optional[int]
|
|
169
|
-
upper_bound: Optional[int]
|
|
168
|
+
lower_bound: Optional[int] = None
|
|
169
|
+
upper_bound: Optional[int] = None
|
|
170
170
|
|
|
171
171
|
def __init__(self, **data: Any) -> None:
|
|
172
172
|
if "name" not in data:
|
|
@@ -191,8 +191,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
194
|
-
is_binary = True
|
|
195
|
-
output_type = "float"
|
|
194
|
+
is_binary: bool = True
|
|
195
|
+
output_type: Optional[str] = "float"
|
|
196
196
|
|
|
197
197
|
date_unit: Optional[str] = None
|
|
198
198
|
|
|
@@ -226,12 +226,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
226
226
|
|
|
227
227
|
|
|
228
228
|
class DatePercentile(DatePercentileBase):
|
|
229
|
-
name = "date_per"
|
|
230
|
-
alias = "date_per_method1"
|
|
229
|
+
name: str = "date_per"
|
|
230
|
+
alias: Optional[str] = "date_per_method1"
|
|
231
231
|
|
|
232
|
-
zero_month: Optional[int]
|
|
233
|
-
zero_year: Optional[int]
|
|
234
|
-
zero_bounds: Optional[List[float]]
|
|
232
|
+
zero_month: Optional[int] = None
|
|
233
|
+
zero_year: Optional[int] = None
|
|
234
|
+
zero_bounds: Optional[List[float]] = None
|
|
235
235
|
step: int = 30
|
|
236
236
|
|
|
237
237
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -246,7 +246,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
246
246
|
)
|
|
247
247
|
return res
|
|
248
248
|
|
|
249
|
-
@validator("zero_bounds", pre=True)
|
|
249
|
+
@field_validator("zero_bounds", mode="before")
|
|
250
250
|
def validate_bounds(cls, value):
|
|
251
251
|
if value is None or isinstance(value, list):
|
|
252
252
|
return value
|
|
@@ -264,7 +264,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
264
264
|
|
|
265
265
|
|
|
266
266
|
class DatePercentileMethod2(DatePercentileBase):
|
|
267
|
-
name = "date_per_method2"
|
|
267
|
+
name: str = "date_per_method2"
|
|
268
268
|
|
|
269
269
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
270
270
|
pass
|
upgini/autofe/feature.py
CHANGED
|
@@ -82,9 +82,9 @@ class Feature:
|
|
|
82
82
|
self.alias = alias
|
|
83
83
|
|
|
84
84
|
def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
|
|
85
|
-
obj_dict = self.op.dict().copy()
|
|
85
|
+
obj_dict = self.op.model_dump().copy()
|
|
86
86
|
obj_dict.update(params or {})
|
|
87
|
-
self.op = self.op.__class__.parse_obj(obj_dict)
|
|
87
|
+
self.op = self.op.__class__.model_validate(obj_dict)
|
|
88
88
|
self.op.set_params(params)
|
|
89
89
|
|
|
90
90
|
for child in self.children:
|
upgini/autofe/groupby.py
CHANGED
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
is_grouping = True
|
|
12
|
-
is_distribution_dependent = True
|
|
10
|
+
is_vectorizable: bool = True
|
|
11
|
+
is_grouping: bool = True
|
|
12
|
+
is_distribution_dependent: bool = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name = "GroupByThenMedian"
|
|
28
|
-
pandas_agg = "median"
|
|
29
|
-
is_distribution_dependent = True
|
|
27
|
+
name: str = "GroupByThenMedian"
|
|
28
|
+
pandas_agg: str = "median"
|
|
29
|
+
is_distribution_dependent: bool = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name = "GroupByThenRank"
|
|
34
|
-
is_vectorizable = True
|
|
35
|
-
is_grouping = True
|
|
36
|
-
output_type = "float"
|
|
37
|
-
is_distribution_dependent = True
|
|
33
|
+
name: str = "GroupByThenRank"
|
|
34
|
+
is_vectorizable: bool = True
|
|
35
|
+
is_grouping: bool = True
|
|
36
|
+
output_type: Optional[str] = "float"
|
|
37
|
+
is_distribution_dependent: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name = "GroupByThenNUnique"
|
|
53
|
-
is_vectorizable = True
|
|
54
|
-
is_grouping = True
|
|
55
|
-
output_type = "int"
|
|
56
|
-
is_distribution_dependent = True
|
|
57
|
-
input_type = "discrete"
|
|
52
|
+
name: str = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
is_grouping: bool = True
|
|
55
|
+
output_type: Optional[str] = "int"
|
|
56
|
+
is_distribution_dependent: bool = True
|
|
57
|
+
input_type: Optional[str] = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name = "GroupByThenFreq"
|
|
73
|
-
is_grouping = True
|
|
74
|
-
output_type = "float"
|
|
75
|
-
is_distribution_dependent = True
|
|
76
|
-
input_type = "discrete"
|
|
72
|
+
name: str = "GroupByThenFreq"
|
|
73
|
+
is_grouping: bool = True
|
|
74
|
+
output_type: Optional[str] = "float"
|
|
75
|
+
is_distribution_dependent: bool = True
|
|
76
|
+
input_type: Optional[str] = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
upgini/autofe/operand.py
CHANGED
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str] = None
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str] = None
|
|
16
|
+
output_type: Optional[str] = None
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]] = None
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|
upgini/autofe/unary.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from typing import Optional
|
|
1
2
|
import numpy as np
|
|
2
3
|
import pandas as pd
|
|
3
4
|
from sklearn.preprocessing import Normalizer
|
|
@@ -6,10 +7,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Abs(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "abs"
|
|
10
|
-
is_unary = True
|
|
11
|
-
is_vectorizable = True
|
|
12
|
-
group_index = 0
|
|
10
|
+
name: str = "abs"
|
|
11
|
+
is_unary: bool = True
|
|
12
|
+
is_vectorizable: bool = True
|
|
13
|
+
group_index: int = 0
|
|
13
14
|
|
|
14
15
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
15
16
|
return data.abs()
|
|
@@ -19,11 +20,11 @@ class Abs(PandasOperand, VectorizableMixin):
|
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class Log(PandasOperand, VectorizableMixin):
|
|
22
|
-
name = "log"
|
|
23
|
-
is_unary = True
|
|
24
|
-
is_vectorizable = True
|
|
25
|
-
output_type = "float"
|
|
26
|
-
group_index = 0
|
|
23
|
+
name: str = "log"
|
|
24
|
+
is_unary: bool = True
|
|
25
|
+
is_vectorizable: bool = True
|
|
26
|
+
output_type: Optional[str] = "float"
|
|
27
|
+
group_index: int = 0
|
|
27
28
|
|
|
28
29
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
29
30
|
return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
|
|
@@ -33,11 +34,11 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class Sqrt(PandasOperand, VectorizableMixin):
|
|
36
|
-
name = "sqrt"
|
|
37
|
-
is_unary = True
|
|
38
|
-
is_vectorizable = True
|
|
39
|
-
output_type = "float"
|
|
40
|
-
group_index = 0
|
|
37
|
+
name: str = "sqrt"
|
|
38
|
+
is_unary: bool = True
|
|
39
|
+
is_vectorizable: bool = True
|
|
40
|
+
output_type: Optional[str] = "float"
|
|
41
|
+
group_index: int = 0
|
|
41
42
|
|
|
42
43
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
43
44
|
return self._round_value(np.sqrt(np.abs(data)))
|
|
@@ -47,10 +48,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
class Square(PandasOperand, VectorizableMixin):
|
|
50
|
-
name = "square"
|
|
51
|
-
is_unary = True
|
|
52
|
-
is_vectorizable = True
|
|
53
|
-
group_index = 0
|
|
51
|
+
name: str = "square"
|
|
52
|
+
is_unary: bool = True
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
group_index: int = 0
|
|
54
55
|
|
|
55
56
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
56
57
|
return np.square(data)
|
|
@@ -60,11 +61,11 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
63
|
-
name = "sigmoid"
|
|
64
|
-
is_unary = True
|
|
65
|
-
is_vectorizable = True
|
|
66
|
-
output_type = "float"
|
|
67
|
-
group_index = 0
|
|
64
|
+
name: str = "sigmoid"
|
|
65
|
+
is_unary: bool = True
|
|
66
|
+
is_vectorizable: bool = True
|
|
67
|
+
output_type: Optional[str] = "float"
|
|
68
|
+
group_index: int = 0
|
|
68
69
|
|
|
69
70
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
70
71
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
@@ -74,12 +75,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
74
75
|
|
|
75
76
|
|
|
76
77
|
class Floor(PandasOperand, VectorizableMixin):
|
|
77
|
-
name = "floor"
|
|
78
|
-
is_unary = True
|
|
79
|
-
is_vectorizable = True
|
|
80
|
-
output_type = "int"
|
|
81
|
-
input_type = "continuous"
|
|
82
|
-
group_index = 0
|
|
78
|
+
name: str = "floor"
|
|
79
|
+
is_unary: bool = True
|
|
80
|
+
is_vectorizable: bool = True
|
|
81
|
+
output_type: Optional[str] = "int"
|
|
82
|
+
input_type: Optional[str] = "continuous"
|
|
83
|
+
group_index: int = 0
|
|
83
84
|
|
|
84
85
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
85
86
|
return np.floor(data)
|
|
@@ -89,11 +90,11 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
89
90
|
|
|
90
91
|
|
|
91
92
|
class Residual(PandasOperand, VectorizableMixin):
|
|
92
|
-
name = "residual"
|
|
93
|
-
is_unary = True
|
|
94
|
-
is_vectorizable = True
|
|
95
|
-
input_type = "continuous"
|
|
96
|
-
group_index = 0
|
|
93
|
+
name: str = "residual"
|
|
94
|
+
is_unary: bool = True
|
|
95
|
+
is_vectorizable: bool = True
|
|
96
|
+
input_type: Optional[str] = "continuous"
|
|
97
|
+
group_index: int = 0
|
|
97
98
|
|
|
98
99
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
99
100
|
return data - np.floor(data)
|
|
@@ -103,11 +104,11 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
103
104
|
|
|
104
105
|
|
|
105
106
|
class Freq(PandasOperand):
|
|
106
|
-
name = "freq"
|
|
107
|
-
is_unary = True
|
|
108
|
-
output_type = "float"
|
|
109
|
-
is_distribution_dependent = True
|
|
110
|
-
input_type = "discrete"
|
|
107
|
+
name: str = "freq"
|
|
108
|
+
is_unary: bool = True
|
|
109
|
+
output_type: Optional[str] = "float"
|
|
110
|
+
is_distribution_dependent: bool = True
|
|
111
|
+
input_type: Optional[str] = "discrete"
|
|
111
112
|
|
|
112
113
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
113
114
|
value_counts = data.value_counts(normalize=True)
|
|
@@ -115,9 +116,9 @@ class Freq(PandasOperand):
|
|
|
115
116
|
|
|
116
117
|
|
|
117
118
|
class Norm(PandasOperand):
|
|
118
|
-
name = "norm"
|
|
119
|
-
is_unary = True
|
|
120
|
-
output_type = "float"
|
|
119
|
+
name: str = "norm"
|
|
120
|
+
is_unary: bool = True
|
|
121
|
+
output_type: Optional[str] = "float"
|
|
121
122
|
|
|
122
123
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
123
124
|
data_dropna = data.dropna()
|
|
@@ -131,7 +132,7 @@ class Norm(PandasOperand):
|
|
|
131
132
|
|
|
132
133
|
|
|
133
134
|
class Embeddings(PandasOperand):
|
|
134
|
-
name = "emb"
|
|
135
|
-
is_unary = True
|
|
136
|
-
input_type = "string"
|
|
137
|
-
output_type = "vector"
|
|
135
|
+
name: str = "emb"
|
|
136
|
+
is_unary: bool = True
|
|
137
|
+
input_type: Optional[str] = "string"
|
|
138
|
+
output_type: Optional[str] = "vector"
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "mean"
|
|
10
|
-
output_type = "float"
|
|
11
|
-
is_vector = True
|
|
12
|
-
group_index = 0
|
|
9
|
+
name: str = "mean"
|
|
10
|
+
output_type: Optional[str] = "float"
|
|
11
|
+
is_vector: bool = True
|
|
12
|
+
group_index: int = 0
|
|
13
13
|
|
|
14
14
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
15
15
|
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class Sum(PandasOperand, VectorizableMixin):
|
|
19
|
-
name = "sum"
|
|
20
|
-
is_vector = True
|
|
21
|
-
group_index = 0
|
|
19
|
+
name: str = "sum"
|
|
20
|
+
is_vector: bool = True
|
|
21
|
+
group_index: int = 0
|
|
22
22
|
|
|
23
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
upgini/features_enricher.py
CHANGED
|
@@ -23,7 +23,6 @@ from pandas.api.types import (
|
|
|
23
23
|
is_datetime64_any_dtype,
|
|
24
24
|
is_numeric_dtype,
|
|
25
25
|
is_object_dtype,
|
|
26
|
-
is_period_dtype,
|
|
27
26
|
is_string_dtype,
|
|
28
27
|
)
|
|
29
28
|
from scipy.stats import ks_2samp
|
|
@@ -1408,7 +1407,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1408
1407
|
# TODO maybe there is no more need for these convertions
|
|
1409
1408
|
# Remove datetime features
|
|
1410
1409
|
datetime_features = [
|
|
1411
|
-
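f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])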
|
|
1410
|
+
f
|
|
1411
|
+
for f in fitting_X.columns
|
|
1412
|
+
if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
|
|
1412
1413
|
]
|
|
1413
1414
|
if len(datetime_features) > 0:
|
|
1414
1415
|
self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
|
upgini/http.py
CHANGED
|
@@ -39,18 +39,6 @@ from upgini.metadata import (
|
|
|
39
39
|
from upgini.resource_bundle import bundle
|
|
40
40
|
from upgini.utils.track_info import get_track_metrics
|
|
41
41
|
|
|
42
|
-
# try:
|
|
43
|
-
# from importlib.metadata import version # type: ignore
|
|
44
|
-
|
|
45
|
-
# __version__ = version("upgini")
|
|
46
|
-
# except ImportError:
|
|
47
|
-
# try:
|
|
48
|
-
# from importlib_metadata import version # type: ignore
|
|
49
|
-
|
|
50
|
-
# __version__ = version("upgini")
|
|
51
|
-
# except ImportError:
|
|
52
|
-
# __version__ = "Upgini wasn't installed"
|
|
53
|
-
|
|
54
42
|
UPGINI_URL: str = "UPGINI_URL"
|
|
55
43
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
56
44
|
DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
|
|
@@ -452,18 +440,18 @@ class _RestClient:
|
|
|
452
440
|
content = file.read()
|
|
453
441
|
md5_hash.update(content)
|
|
454
442
|
digest = md5_hash.hexdigest()
|
|
455
|
-
metadata_with_md5 = metadata.
|
|
443
|
+
metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
|
|
456
444
|
|
|
457
445
|
digest_sha256 = hashlib.sha256(
|
|
458
446
|
pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
459
447
|
).hexdigest()
|
|
460
|
-
metadata_with_md5 = metadata_with_md5.
|
|
448
|
+
metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
|
|
461
449
|
|
|
462
450
|
with open(file_path, "rb") as file:
|
|
463
451
|
files = {
|
|
464
452
|
"metadata": (
|
|
465
453
|
"metadata.json",
|
|
466
|
-
metadata_with_md5.
|
|
454
|
+
metadata_with_md5.model_dump_json(exclude_none=True).encode(),
|
|
467
455
|
"application/json",
|
|
468
456
|
),
|
|
469
457
|
"tracking": (
|
|
@@ -471,13 +459,17 @@ class _RestClient:
|
|
|
471
459
|
dumps(track_metrics).encode(),
|
|
472
460
|
"application/json",
|
|
473
461
|
),
|
|
474
|
-
"metrics": (
|
|
462
|
+
"metrics": (
|
|
463
|
+
"metrics.json",
|
|
464
|
+
metrics.model_dump_json(exclude_none=True).encode(),
|
|
465
|
+
"application/json",
|
|
466
|
+
),
|
|
475
467
|
"file": (metadata_with_md5.name, file, "application/octet-stream"),
|
|
476
468
|
}
|
|
477
469
|
if search_customization is not None:
|
|
478
470
|
files["customization"] = (
|
|
479
471
|
"customization.json",
|
|
480
|
-
search_customization.
|
|
472
|
+
search_customization.model_dump_json(exclude_none=True).encode(),
|
|
481
473
|
"application/json",
|
|
482
474
|
)
|
|
483
475
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
|
@@ -492,7 +484,7 @@ class _RestClient:
|
|
|
492
484
|
def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
|
|
493
485
|
api_path = self.CHECK_UPLOADED_FILE_URL_FMT_V2.format(file_upload_id)
|
|
494
486
|
response = self._with_unauth_retry(
|
|
495
|
-
lambda: self._send_post_req(api_path, trace_id, metadata.
|
|
487
|
+
lambda: self._send_post_req(api_path, trace_id, metadata.model_dump_json(exclude_none=True))
|
|
496
488
|
)
|
|
497
489
|
return bool(response)
|
|
498
490
|
|
|
@@ -506,11 +498,11 @@ class _RestClient:
|
|
|
506
498
|
) -> SearchTaskResponse:
|
|
507
499
|
api_path = self.INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id)
|
|
508
500
|
files = {
|
|
509
|
-
"metadata": ("metadata.json", metadata.
|
|
510
|
-
"metrics": ("metrics.json", metrics.
|
|
501
|
+
"metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
502
|
+
"metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
511
503
|
}
|
|
512
504
|
if search_customization is not None:
|
|
513
|
-
files["customization"] = search_customization.
|
|
505
|
+
files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
|
|
514
506
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
|
515
507
|
response = self._with_unauth_retry(
|
|
516
508
|
lambda: self._send_post_file_req_v2(
|
|
@@ -536,18 +528,18 @@ class _RestClient:
|
|
|
536
528
|
content = file.read()
|
|
537
529
|
md5_hash.update(content)
|
|
538
530
|
digest = md5_hash.hexdigest()
|
|
539
|
-
metadata_with_md5 = metadata.
|
|
531
|
+
metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
|
|
540
532
|
|
|
541
533
|
digest_sha256 = hashlib.sha256(
|
|
542
534
|
pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
543
535
|
).hexdigest()
|
|
544
|
-
metadata_with_md5 = metadata_with_md5.
|
|
536
|
+
metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
|
|
545
537
|
|
|
546
538
|
with open(file_path, "rb") as file:
|
|
547
539
|
files = {
|
|
548
540
|
"metadata": (
|
|
549
541
|
"metadata.json",
|
|
550
|
-
metadata_with_md5.
|
|
542
|
+
metadata_with_md5.model_dump_json(exclude_none=True).encode(),
|
|
551
543
|
"application/json",
|
|
552
544
|
),
|
|
553
545
|
"tracking": (
|
|
@@ -555,13 +547,17 @@ class _RestClient:
|
|
|
555
547
|
dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
|
|
556
548
|
"application/json",
|
|
557
549
|
),
|
|
558
|
-
"metrics": (
|
|
550
|
+
"metrics": (
|
|
551
|
+
"metrics.json",
|
|
552
|
+
metrics.model_dump_json(exclude_none=True).encode(),
|
|
553
|
+
"application/json",
|
|
554
|
+
),
|
|
559
555
|
"file": (metadata_with_md5.name, file, "application/octet-stream"),
|
|
560
556
|
}
|
|
561
557
|
if search_customization is not None:
|
|
562
558
|
files["customization"] = (
|
|
563
559
|
"customization.json",
|
|
564
|
-
search_customization.
|
|
560
|
+
search_customization.model_dump_json(exclude_none=True).encode(),
|
|
565
561
|
"application/json",
|
|
566
562
|
)
|
|
567
563
|
|
|
@@ -585,11 +581,11 @@ class _RestClient:
|
|
|
585
581
|
) -> SearchTaskResponse:
|
|
586
582
|
api_path = self.VALIDATION_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id, initial_search_task_id)
|
|
587
583
|
files = {
|
|
588
|
-
"metadata": ("metadata.json", metadata.
|
|
589
|
-
"metrics": ("metrics.json", metrics.
|
|
584
|
+
"metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
585
|
+
"metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
590
586
|
}
|
|
591
587
|
if search_customization is not None:
|
|
592
|
-
files["customization"] = search_customization.
|
|
588
|
+
files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
|
|
593
589
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
|
594
590
|
response = self._with_unauth_retry(
|
|
595
591
|
lambda: self._send_post_file_req_v2(
|
|
@@ -651,7 +647,11 @@ class _RestClient:
|
|
|
651
647
|
with open(file_path, "rb") as file:
|
|
652
648
|
files = {
|
|
653
649
|
"file": (metadata.name, file, "application/octet-stream"),
|
|
654
|
-
"metadata": (
|
|
650
|
+
"metadata": (
|
|
651
|
+
"metadata.json",
|
|
652
|
+
metadata.model_dump_json(exclude_none=True).encode(),
|
|
653
|
+
"application/json",
|
|
654
|
+
),
|
|
655
655
|
}
|
|
656
656
|
|
|
657
657
|
return self._send_post_file_req_v2(api_path, files)
|
|
@@ -661,12 +661,12 @@ class _RestClient:
|
|
|
661
661
|
def get_search_file_metadata(self, search_task_id: str, trace_id: str) -> FileMetadata:
|
|
662
662
|
api_path = self.SEARCH_FILE_METADATA_URI_FMT_V2.format(search_task_id)
|
|
663
663
|
response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
|
|
664
|
-
return FileMetadata.
|
|
664
|
+
return FileMetadata.model_validate(response)
|
|
665
665
|
|
|
666
666
|
def get_provider_search_metadata_v3(self, provider_search_task_id: str, trace_id: str) -> ProviderTaskMetadataV2:
|
|
667
667
|
api_path = self.SEARCH_TASK_METADATA_FMT_V3.format(provider_search_task_id)
|
|
668
668
|
response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
|
|
669
|
-
return ProviderTaskMetadataV2.
|
|
669
|
+
return ProviderTaskMetadataV2.model_validate(response)
|
|
670
670
|
|
|
671
671
|
def get_current_transform_usage(self, trace_id) -> TransformUsage:
|
|
672
672
|
track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
|
upgini/lazy_import.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import importlib
|
|
2
|
+
import importlib.util
|
|
3
|
+
import importlib.machinery
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
class LazyImport:
|
|
@@ -10,7 +12,18 @@ class LazyImport:
|
|
|
10
12
|
|
|
11
13
|
def _load(self):
|
|
12
14
|
if self._module is None:
|
|
13
|
-
|
|
15
|
+
# Load module and save link to it
|
|
16
|
+
spec = importlib.util.find_spec(self.module_name)
|
|
17
|
+
if spec is None:
|
|
18
|
+
raise ImportError(f"Module {self.module_name} not found")
|
|
19
|
+
|
|
20
|
+
# Create module
|
|
21
|
+
self._module = importlib.util.module_from_spec(spec)
|
|
22
|
+
|
|
23
|
+
# Execute module
|
|
24
|
+
spec.loader.exec_module(self._module)
|
|
25
|
+
|
|
26
|
+
# Get class from module
|
|
14
27
|
self._class = getattr(self._module, self.class_name)
|
|
15
28
|
|
|
16
29
|
def __call__(self, *args, **kwargs):
|
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Dict, List, Optional, Set, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -172,23 +172,23 @@ class FileMetricsInterval(BaseModel):
|
|
|
172
172
|
date_cut: float
|
|
173
173
|
count: float
|
|
174
174
|
valid_count: float
|
|
175
|
-
avg_target: Optional[float] # not for multiclass
|
|
176
|
-
avg_score_etalon: Optional[float]
|
|
175
|
+
avg_target: Optional[float] = None # not for multiclass
|
|
176
|
+
avg_score_etalon: Optional[float] = None
|
|
177
177
|
|
|
178
178
|
|
|
179
179
|
class FileMetrics(BaseModel):
|
|
180
180
|
# etalon metadata
|
|
181
|
-
task_type: Optional[ModelTaskType]
|
|
182
|
-
label: Optional[ModelLabelType]
|
|
183
|
-
count: Optional[int]
|
|
184
|
-
valid_count: Optional[int]
|
|
185
|
-
valid_rate: Optional[float]
|
|
186
|
-
avg_target: Optional[float]
|
|
187
|
-
metrics_binary_etalon: Optional[BinaryTask]
|
|
188
|
-
metrics_regression_etalon: Optional[RegressionTask]
|
|
189
|
-
metrics_multiclass_etalon: Optional[MulticlassTask]
|
|
190
|
-
cuts: Optional[List[float]]
|
|
191
|
-
interval: Optional[List[FileMetricsInterval]]
|
|
181
|
+
task_type: Optional[ModelTaskType] = None
|
|
182
|
+
label: Optional[ModelLabelType] = None
|
|
183
|
+
count: Optional[int] = None
|
|
184
|
+
valid_count: Optional[int] = None
|
|
185
|
+
valid_rate: Optional[float] = None
|
|
186
|
+
avg_target: Optional[float] = None
|
|
187
|
+
metrics_binary_etalon: Optional[BinaryTask] = None
|
|
188
|
+
metrics_regression_etalon: Optional[RegressionTask] = None
|
|
189
|
+
metrics_multiclass_etalon: Optional[MulticlassTask] = None
|
|
190
|
+
cuts: Optional[List[float]] = None
|
|
191
|
+
interval: Optional[List[FileMetricsInterval]] = None
|
|
192
192
|
|
|
193
193
|
|
|
194
194
|
class NumericInterval(BaseModel):
|
|
@@ -202,25 +202,25 @@ class FileColumnMetadata(BaseModel):
|
|
|
202
202
|
dataType: DataType
|
|
203
203
|
meaningType: FileColumnMeaningType
|
|
204
204
|
minMaxValues: Optional[NumericInterval] = None
|
|
205
|
-
originalName: Optional[str]
|
|
205
|
+
originalName: Optional[str] = None
|
|
206
206
|
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
207
207
|
isUnnest: bool = False
|
|
208
208
|
# list of original etalon key column names like msisdn1, msisdn2
|
|
209
|
-
unnestKeyNames: Optional[List[str]]
|
|
209
|
+
unnestKeyNames: Optional[List[str]] = None
|
|
210
210
|
|
|
211
211
|
|
|
212
212
|
class FileMetadata(BaseModel):
|
|
213
213
|
name: str
|
|
214
|
-
description: Optional[str]
|
|
214
|
+
description: Optional[str] = None
|
|
215
215
|
columns: List[FileColumnMetadata]
|
|
216
216
|
searchKeys: List[List[str]]
|
|
217
|
-
excludeFeaturesSources: Optional[List[str]]
|
|
218
|
-
hierarchicalGroupKeys: Optional[List[str]]
|
|
219
|
-
hierarchicalSubgroupKeys: Optional[List[str]]
|
|
220
|
-
taskType: Optional[ModelTaskType]
|
|
221
|
-
rowsCount: Optional[int]
|
|
222
|
-
checksumMD5: Optional[str]
|
|
223
|
-
digest: Optional[str]
|
|
217
|
+
excludeFeaturesSources: Optional[List[str]] = None
|
|
218
|
+
hierarchicalGroupKeys: Optional[List[str]] = None
|
|
219
|
+
hierarchicalSubgroupKeys: Optional[List[str]] = None
|
|
220
|
+
taskType: Optional[ModelTaskType] = None
|
|
221
|
+
rowsCount: Optional[int] = None
|
|
222
|
+
checksumMD5: Optional[str] = None
|
|
223
|
+
digest: Optional[str] = None
|
|
224
224
|
|
|
225
225
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
|
226
226
|
for c in self.columns:
|
|
@@ -244,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
|
|
|
244
244
|
source: str
|
|
245
245
|
hit_rate: float
|
|
246
246
|
shap_value: float
|
|
247
|
-
commercial_schema: Optional[str]
|
|
248
|
-
data_provider: Optional[str]
|
|
249
|
-
data_providers: Optional[List[str]]
|
|
250
|
-
data_provider_link: Optional[str]
|
|
251
|
-
data_provider_links: Optional[List[str]]
|
|
252
|
-
data_source: Optional[str]
|
|
253
|
-
data_sources: Optional[List[str]]
|
|
254
|
-
data_source_link: Optional[str]
|
|
255
|
-
data_source_links: Optional[List[str]]
|
|
256
|
-
doc_link: Optional[str]
|
|
257
|
-
update_frequency: Optional[str]
|
|
247
|
+
commercial_schema: Optional[str] = None
|
|
248
|
+
data_provider: Optional[str] = None
|
|
249
|
+
data_providers: Optional[List[str]] = None
|
|
250
|
+
data_provider_link: Optional[str] = None
|
|
251
|
+
data_provider_links: Optional[List[str]] = None
|
|
252
|
+
data_source: Optional[str] = None
|
|
253
|
+
data_sources: Optional[List[str]] = None
|
|
254
|
+
data_source_link: Optional[str] = None
|
|
255
|
+
data_source_links: Optional[List[str]] = None
|
|
256
|
+
doc_link: Optional[str] = None
|
|
257
|
+
update_frequency: Optional[str] = None
|
|
258
258
|
|
|
259
259
|
|
|
260
260
|
class HitRateMetrics(BaseModel):
|
|
@@ -274,48 +274,48 @@ class ModelEvalSet(BaseModel):
|
|
|
274
274
|
class BaseColumnMetadata(BaseModel):
|
|
275
275
|
original_name: str
|
|
276
276
|
hashed_name: str
|
|
277
|
-
ads_definition_id: Optional[str]
|
|
277
|
+
ads_definition_id: Optional[str] = None
|
|
278
278
|
is_augmented: bool
|
|
279
279
|
|
|
280
280
|
|
|
281
281
|
class GeneratedFeatureMetadata(BaseModel):
|
|
282
|
-
alias: Optional[str]
|
|
282
|
+
alias: Optional[str] = None
|
|
283
283
|
formula: str
|
|
284
284
|
display_index: str
|
|
285
285
|
base_columns: List[BaseColumnMetadata]
|
|
286
|
-
operator_params: Optional[Dict[str, str]]
|
|
286
|
+
operator_params: Optional[Dict[str, str]] = None
|
|
287
287
|
|
|
288
288
|
|
|
289
289
|
class ProviderTaskMetadataV2(BaseModel):
|
|
290
290
|
features: List[FeaturesMetadataV2]
|
|
291
|
-
hit_rate_metrics: Optional[HitRateMetrics]
|
|
292
|
-
eval_set_metrics: Optional[List[ModelEvalSet]]
|
|
293
|
-
zero_hit_rate_search_keys: Optional[List[str]]
|
|
294
|
-
features_used_for_embeddings: Optional[List[str]]
|
|
295
|
-
shuffle_kfold: Optional[bool]
|
|
296
|
-
generated_features: Optional[List[GeneratedFeatureMetadata]]
|
|
291
|
+
hit_rate_metrics: Optional[HitRateMetrics] = None
|
|
292
|
+
eval_set_metrics: Optional[List[ModelEvalSet]] = None
|
|
293
|
+
zero_hit_rate_search_keys: Optional[List[str]] = None
|
|
294
|
+
features_used_for_embeddings: Optional[List[str]] = None
|
|
295
|
+
shuffle_kfold: Optional[bool] = None
|
|
296
|
+
generated_features: Optional[List[GeneratedFeatureMetadata]] = None
|
|
297
297
|
|
|
298
298
|
|
|
299
299
|
class FeaturesFilter(BaseModel):
|
|
300
|
-
minImportance: Optional[float]
|
|
301
|
-
maxPSI: Optional[float]
|
|
302
|
-
maxCount: Optional[int]
|
|
303
|
-
selectedFeatures: Optional[List[str]]
|
|
300
|
+
minImportance: Optional[float] = None
|
|
301
|
+
maxPSI: Optional[float] = None
|
|
302
|
+
maxCount: Optional[int] = None
|
|
303
|
+
selectedFeatures: Optional[List[str]] = None
|
|
304
304
|
|
|
305
305
|
|
|
306
306
|
class RuntimeParameters(BaseModel):
|
|
307
|
-
properties: Dict[str,
|
|
307
|
+
properties: Dict[str, Any] = {}
|
|
308
308
|
|
|
309
309
|
|
|
310
310
|
class SearchCustomization(BaseModel):
|
|
311
|
-
featuresFilter: Optional[FeaturesFilter]
|
|
312
|
-
extractFeatures: Optional[bool]
|
|
313
|
-
accurateModel: Optional[bool]
|
|
314
|
-
importanceThreshold: Optional[float]
|
|
315
|
-
maxFeatures: Optional[int]
|
|
316
|
-
returnScores: Optional[bool]
|
|
317
|
-
runtimeParameters: Optional[RuntimeParameters]
|
|
318
|
-
metricsCalculation: Optional[bool]
|
|
311
|
+
featuresFilter: Optional[FeaturesFilter] = None
|
|
312
|
+
extractFeatures: Optional[bool] = None
|
|
313
|
+
accurateModel: Optional[bool] = None
|
|
314
|
+
importanceThreshold: Optional[float] = None
|
|
315
|
+
maxFeatures: Optional[int] = None
|
|
316
|
+
returnScores: Optional[bool] = None
|
|
317
|
+
runtimeParameters: Optional[RuntimeParameters] = None
|
|
318
|
+
metricsCalculation: Optional[bool] = None
|
|
319
319
|
|
|
320
320
|
def __repr__(self):
|
|
321
321
|
return (
|
|
@@ -10,7 +10,6 @@ from pandas.api.types import (
|
|
|
10
10
|
is_float_dtype,
|
|
11
11
|
is_numeric_dtype,
|
|
12
12
|
is_object_dtype,
|
|
13
|
-
is_period_dtype,
|
|
14
13
|
is_string_dtype,
|
|
15
14
|
)
|
|
16
15
|
|
|
@@ -135,7 +134,7 @@ class Normalizer:
|
|
|
135
134
|
|
|
136
135
|
removed_features = []
|
|
137
136
|
for f in features:
|
|
138
|
-
if is_datetime(df[f]) or
|
|
137
|
+
if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
|
|
139
138
|
removed_features.append(f)
|
|
140
139
|
df.drop(columns=f, inplace=True)
|
|
141
140
|
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import is_numeric_dtype
|
|
9
|
+
from pandas.api.types import is_numeric_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
12
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
@@ -84,7 +84,7 @@ class DateTimeSearchKeyConverter:
|
|
|
84
84
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
85
85
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
86
86
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
87
|
-
elif
|
|
87
|
+
elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
|
|
88
88
|
df[self.date_column] = df[self.date_column].dt.to_timestamp()
|
|
89
89
|
elif is_numeric_dtype(df[self.date_column]):
|
|
90
90
|
# 315532801 - 2524608001 - seconds
|
|
@@ -207,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
|
207
207
|
def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
|
|
208
208
|
df = df.copy()
|
|
209
209
|
seconds = "datetime_seconds"
|
|
210
|
-
if
|
|
210
|
+
if isinstance(df[date_col].dtype, pd.PeriodDtype):
|
|
211
211
|
df[date_col] = df[date_col].dt.to_timestamp()
|
|
212
212
|
else:
|
|
213
213
|
df[date_col] = pd.to_datetime(df[date_col])
|
|
@@ -275,7 +275,7 @@ def validate_dates_distribution(
|
|
|
275
275
|
if col in search_keys:
|
|
276
276
|
continue
|
|
277
277
|
try:
|
|
278
|
-
if
|
|
278
|
+
if isinstance(X[col].dtype, pd.PeriodDtype):
|
|
279
279
|
pass
|
|
280
280
|
elif pd.__version__ >= "2.0.0":
|
|
281
281
|
# Format mixed to avoid massive warnings
|
|
@@ -290,7 +290,7 @@ def validate_dates_distribution(
|
|
|
290
290
|
if maybe_date_col is None:
|
|
291
291
|
return
|
|
292
292
|
|
|
293
|
-
if
|
|
293
|
+
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
294
294
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
295
295
|
elif pd.__version__ >= "2.0.0":
|
|
296
296
|
dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
|
upgini/utils/phone_utils.py
CHANGED
|
@@ -1,12 +1,8 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
|
+
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import
|
|
5
|
-
is_float_dtype,
|
|
6
|
-
is_int64_dtype,
|
|
7
|
-
is_object_dtype,
|
|
8
|
-
is_string_dtype,
|
|
9
|
-
)
|
|
5
|
+
from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
|
|
10
6
|
|
|
11
7
|
from upgini.errors import ValidationError
|
|
12
8
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
@@ -63,7 +59,9 @@ class PhoneSearchKeyConverter:
|
|
|
63
59
|
convert_func = self.phone_str_to_int_safe
|
|
64
60
|
elif is_float_dtype(df[self.phone_column]):
|
|
65
61
|
convert_func = self.phone_float_to_int_safe
|
|
66
|
-
elif
|
|
62
|
+
elif df[self.phone_column].dtype == np.int64 or isinstance(
|
|
63
|
+
df[self.phone_column].dtype, pd.Int64Dtype
|
|
64
|
+
):
|
|
67
65
|
convert_func = self.phone_int_to_int_safe
|
|
68
66
|
else:
|
|
69
67
|
raise ValidationError(
|
upgini/utils/target_utils.py
CHANGED
|
@@ -194,4 +194,7 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
|
194
194
|
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
195
195
|
|
|
196
196
|
# Calculate the PSI
|
|
197
|
-
|
|
197
|
+
try:
|
|
198
|
+
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
199
|
+
except Exception:
|
|
200
|
+
return np.nan
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.316a2
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy
|
|
32
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic<
|
|
34
|
+
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=lnBx1YP_mYM1XVyjFPE_kJqGG8UiGn3zcRQt1R0zSbY,26
|
|
2
2
|
upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
8
|
-
upgini/lazy_import.py,sha256=
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=_d8ya5RRoYN0o6mV6gda-bLdOngQ4rb1SA51SlM_TG0,188002
|
|
7
|
+
upgini/http.py,sha256=gCN5ru_I6JNHk-m6-Ckjhd23iMzOAzDSLb0tSEcxkC4,43068
|
|
8
|
+
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
+
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
10
|
upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
|
|
11
11
|
upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -15,19 +15,19 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
|
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
21
|
-
upgini/autofe/groupby.py,sha256=
|
|
22
|
-
upgini/autofe/operand.py,sha256=
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
18
|
+
upgini/autofe/binary.py,sha256=xRBT7RNqQ7pprz6cRpO1KnvZCb7PvU3QXBfaP6Omqi4,7425
|
|
19
|
+
upgini/autofe/date.py,sha256=eLPrO2Cgm74VB4rPtIaeUDuI5vmLiGnygHSvU4aGHWU,9223
|
|
20
|
+
upgini/autofe/feature.py,sha256=CivPkE7YrAtDrgF8WhVPnDAnNDR8gbRQ-8_hXiQE6ew,14234
|
|
21
|
+
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
22
|
+
upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
|
|
23
|
+
upgini/autofe/unary.py,sha256=RiK-Fz3fgjPlqWWfro6x7qChjEZ8W8RTnl5-MT1kaQA,4218
|
|
24
|
+
upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
33
|
upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
|
|
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
42
42
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
|
|
53
|
-
upgini/utils/phone_utils.py,sha256=
|
|
53
|
+
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
54
54
|
upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.316a2.dist-info/METADATA,sha256=h4ZR7hMkhteC4WNoyFJ7dXVTpUJyphWHa8cKmhP5BEQ,48232
|
|
61
|
+
upgini-1.1.316a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.1.316a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.316a2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|