upgini 1.1.316__py3-none-any.whl → 1.1.316a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +72 -75
- upgini/autofe/date.py +21 -21
- upgini/autofe/feature.py +2 -2
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +4 -4
- upgini/autofe/unary.py +47 -46
- upgini/autofe/vector.py +8 -8
- upgini/features_enricher.py +3 -2
- upgini/http.py +20 -31
- upgini/lazy_import.py +14 -1
- upgini/metadata.py +57 -57
- upgini/normalizer/normalize_utils.py +1 -2
- upgini/utils/datetime_utils.py +5 -5
- upgini/utils/phone_utils.py +5 -7
- upgini/utils/target_utils.py +4 -1
- {upgini-1.1.316.dist-info → upgini-1.1.316a1.dist-info}/METADATA +3 -3
- {upgini-1.1.316.dist-info → upgini-1.1.316a1.dist-info}/RECORD +20 -20
- {upgini-1.1.316.dist-info → upgini-1.1.316a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.316.dist-info → upgini-1.1.316a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.316"
|
|
1
|
+
__version__ = "1.1.316a1"
|
upgini/autofe/binary.py
CHANGED
|
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Min(PandasOperand):
|
|
12
|
-
name = "min"
|
|
13
|
-
is_binary = True
|
|
14
|
-
is_symmetrical = True
|
|
15
|
-
has_symmetry_importance = True
|
|
12
|
+
name: str = "min"
|
|
13
|
+
is_binary: bool = True
|
|
14
|
+
is_symmetrical: bool = True
|
|
15
|
+
has_symmetry_importance: bool = True
|
|
16
16
|
|
|
17
17
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Max(PandasOperand):
|
|
22
|
-
name = "max"
|
|
23
|
-
is_binary = True
|
|
24
|
-
is_symmetrical = True
|
|
25
|
-
has_symmetry_importance = True
|
|
22
|
+
name: str = "max"
|
|
23
|
+
is_binary: bool = True
|
|
24
|
+
is_symmetrical: bool = True
|
|
25
|
+
has_symmetry_importance: bool = True
|
|
26
26
|
|
|
27
27
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
32
|
-
name = "+"
|
|
33
|
-
alias = "add"
|
|
34
|
-
is_binary = True
|
|
35
|
-
is_symmetrical = True
|
|
36
|
-
has_symmetry_importance = True
|
|
37
|
-
is_vectorizable = True
|
|
32
|
+
name: str = "+"
|
|
33
|
+
alias: str = "add"
|
|
34
|
+
is_binary: bool = True
|
|
35
|
+
is_symmetrical: bool = True
|
|
36
|
+
has_symmetry_importance: bool = True
|
|
37
|
+
is_vectorizable: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
return left + right
|
|
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
|
-
name = "-"
|
|
52
|
-
alias = "sub"
|
|
53
|
-
is_binary = True
|
|
54
|
-
is_symmetrical = True
|
|
55
|
-
has_symmetry_importance = True
|
|
56
|
-
is_vectorizable = True
|
|
51
|
+
name: str = "-"
|
|
52
|
+
alias: str = "sub"
|
|
53
|
+
is_binary: bool = True
|
|
54
|
+
is_symmetrical: bool = True
|
|
55
|
+
has_symmetry_importance: bool = True
|
|
56
|
+
is_vectorizable: bool = True
|
|
57
57
|
|
|
58
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
59
59
|
return left - right
|
|
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
|
-
name = "*"
|
|
71
|
-
alias = "mul"
|
|
72
|
-
is_binary = True
|
|
73
|
-
is_symmetrical = True
|
|
74
|
-
has_symmetry_importance = True
|
|
75
|
-
is_vectorizable = True
|
|
70
|
+
name: str = "*"
|
|
71
|
+
alias: str = "mul"
|
|
72
|
+
is_binary: bool = True
|
|
73
|
+
is_symmetrical: bool = True
|
|
74
|
+
has_symmetry_importance: bool = True
|
|
75
|
+
is_vectorizable: bool = True
|
|
76
76
|
|
|
77
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
78
78
|
return left * right
|
|
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
89
|
-
name = "/"
|
|
90
|
-
alias = "div"
|
|
91
|
-
is_binary = True
|
|
92
|
-
has_symmetry_importance = True
|
|
93
|
-
is_vectorizable = True
|
|
94
|
-
output_type = "float"
|
|
89
|
+
name: str = "/"
|
|
90
|
+
alias: str = "div"
|
|
91
|
+
is_binary: bool = True
|
|
92
|
+
has_symmetry_importance: bool = True
|
|
93
|
+
is_vectorizable: bool = True
|
|
94
|
+
output_type: Optional[str] = "float"
|
|
95
95
|
|
|
96
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
97
97
|
return left / right.replace(0, np.nan)
|
|
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class Combine(PandasOperand):
|
|
108
|
-
name = "Combine"
|
|
109
|
-
is_binary = True
|
|
110
|
-
has_symmetry_importance = True
|
|
111
|
-
output_type = "object"
|
|
108
|
+
name: str = "Combine"
|
|
109
|
+
is_binary: bool = True
|
|
110
|
+
has_symmetry_importance: bool = True
|
|
111
|
+
output_type: Optional[str] = "object"
|
|
112
112
|
|
|
113
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
114
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
class CombineThenFreq(PandasOperand):
|
|
120
|
-
name = "CombineThenFreq"
|
|
121
|
-
is_binary = True
|
|
122
|
-
is_symmetrical = True
|
|
123
|
-
has_symmetry_importance = True
|
|
124
|
-
output_type = "float"
|
|
125
|
-
is_distribution_dependent = True
|
|
126
|
-
input_type = "discrete"
|
|
120
|
+
name: str = "CombineThenFreq"
|
|
121
|
+
is_binary: bool = True
|
|
122
|
+
is_symmetrical: bool = True
|
|
123
|
+
has_symmetry_importance: bool = True
|
|
124
|
+
output_type: Optional[str] = "float"
|
|
125
|
+
is_distribution_dependent: bool = True
|
|
126
|
+
input_type: Optional[str] = "discrete"
|
|
127
127
|
|
|
128
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
129
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
class Distance(PandasOperand):
|
|
136
|
-
name = "dist"
|
|
137
|
-
is_binary = True
|
|
138
|
-
output_type = "float"
|
|
139
|
-
is_symmetrical = True
|
|
140
|
-
has_symmetry_importance = True
|
|
136
|
+
name: str = "dist"
|
|
137
|
+
is_binary: bool = True
|
|
138
|
+
output_type: Optional[str] = "float"
|
|
139
|
+
is_symmetrical: bool = True
|
|
140
|
+
has_symmetry_importance: bool = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
144
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
# row-wise dot product
|
|
@@ -152,17 +152,14 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
-
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
156
|
-
return np.sqrt(self.__dot(vector, vector))
|
|
157
|
-
|
|
158
155
|
|
|
159
156
|
# Left for backward compatibility
|
|
160
157
|
class Sim(Distance):
|
|
161
|
-
name = "sim"
|
|
162
|
-
is_binary = True
|
|
163
|
-
output_type = "float"
|
|
164
|
-
is_symmetrical = True
|
|
165
|
-
has_symmetry_importance = True
|
|
158
|
+
name: str = "sim"
|
|
159
|
+
is_binary: bool = True
|
|
160
|
+
output_type: Optional[str] = "float"
|
|
161
|
+
is_symmetrical: bool = True
|
|
162
|
+
has_symmetry_importance: bool = True
|
|
166
163
|
|
|
167
164
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
165
|
return 1 - super().calculate_binary(left, right)
|
|
@@ -191,12 +188,12 @@ class StringSim(PandasOperand, abc.ABC):
|
|
|
191
188
|
|
|
192
189
|
|
|
193
190
|
class JaroWinklerSim1(StringSim):
|
|
194
|
-
name = "sim_jw1"
|
|
195
|
-
is_binary = True
|
|
196
|
-
input_type = "string"
|
|
197
|
-
output_type = "float"
|
|
198
|
-
is_symmetrical = True
|
|
199
|
-
has_symmetry_importance = True
|
|
191
|
+
name: str = "sim_jw1"
|
|
192
|
+
is_binary: bool = True
|
|
193
|
+
input_type: Optional[str] = "string"
|
|
194
|
+
output_type: Optional[str] = "float"
|
|
195
|
+
is_symmetrical: bool = True
|
|
196
|
+
has_symmetry_importance: bool = True
|
|
200
197
|
|
|
201
198
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
202
199
|
return value
|
|
@@ -206,12 +203,12 @@ class JaroWinklerSim1(StringSim):
|
|
|
206
203
|
|
|
207
204
|
|
|
208
205
|
class JaroWinklerSim2(StringSim):
|
|
209
|
-
name = "sim_jw2"
|
|
210
|
-
is_binary = True
|
|
211
|
-
input_type = "string"
|
|
212
|
-
output_type = "float"
|
|
213
|
-
is_symmetrical = True
|
|
214
|
-
has_symmetry_importance = True
|
|
206
|
+
name: str = "sim_jw2"
|
|
207
|
+
is_binary: bool = True
|
|
208
|
+
input_type: Optional[str] = "string"
|
|
209
|
+
output_type: Optional[str] = "float"
|
|
210
|
+
is_symmetrical: bool = True
|
|
211
|
+
has_symmetry_importance: bool = True
|
|
215
212
|
|
|
216
213
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
217
214
|
return value[::-1] if value is not None else None
|
|
@@ -221,12 +218,12 @@ class JaroWinklerSim2(StringSim):
|
|
|
221
218
|
|
|
222
219
|
|
|
223
220
|
class LevenshteinSim(StringSim):
|
|
224
|
-
name = "sim_lv"
|
|
225
|
-
is_binary = True
|
|
226
|
-
input_type = "string"
|
|
227
|
-
output_type = "float"
|
|
228
|
-
is_symmetrical = True
|
|
229
|
-
has_symmetry_importance = True
|
|
221
|
+
name: str = "sim_lv"
|
|
222
|
+
is_binary: bool = True
|
|
223
|
+
input_type: Optional[str] = "string"
|
|
224
|
+
output_type: Optional[str] = "float"
|
|
225
|
+
is_symmetrical: bool = True
|
|
226
|
+
has_symmetry_importance: bool = True
|
|
230
227
|
|
|
231
228
|
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
232
229
|
return value
|
upgini/autofe/date.py
CHANGED
|
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
7
|
-
from pydantic import BaseModel, validator
|
|
7
|
+
from pydantic import BaseModel, field_validator
|
|
8
8
|
|
|
9
9
|
from upgini.autofe.operand import PandasOperand
|
|
10
10
|
|
|
@@ -38,10 +38,10 @@ class DateDiffMixin(BaseModel):
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
class DateDiff(PandasOperand, DateDiffMixin):
|
|
41
|
-
name = "date_diff"
|
|
42
|
-
alias = "date_diff_type1"
|
|
43
|
-
is_binary = True
|
|
44
|
-
has_symmetry_importance = True
|
|
41
|
+
name: str = "date_diff"
|
|
42
|
+
alias: Optional[str] = "date_diff_type1"
|
|
43
|
+
is_binary: bool = True
|
|
44
|
+
has_symmetry_importance: bool = True
|
|
45
45
|
|
|
46
46
|
replace_negative: bool = False
|
|
47
47
|
|
|
@@ -70,9 +70,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
73
|
-
name = "date_diff_type2"
|
|
74
|
-
is_binary = True
|
|
75
|
-
has_symmetry_importance = True
|
|
73
|
+
name: str = "date_diff_type2"
|
|
74
|
+
is_binary: bool = True
|
|
75
|
+
has_symmetry_importance: bool = True
|
|
76
76
|
|
|
77
77
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
78
78
|
res = super().get_params()
|
|
@@ -104,8 +104,8 @@ _count_aggregations = ["nunique", "count"]
|
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
107
|
-
is_binary = True
|
|
108
|
-
has_symmetry_importance = True
|
|
107
|
+
is_binary: bool = True
|
|
108
|
+
has_symmetry_importance: bool = True
|
|
109
109
|
|
|
110
110
|
aggregation: str
|
|
111
111
|
replace_negative: bool = False
|
|
@@ -165,8 +165,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
165
165
|
|
|
166
166
|
|
|
167
167
|
class DateListDiffBounded(DateListDiff):
|
|
168
|
-
lower_bound: Optional[int]
|
|
169
|
-
upper_bound: Optional[int]
|
|
168
|
+
lower_bound: Optional[int] = None
|
|
169
|
+
upper_bound: Optional[int] = None
|
|
170
170
|
|
|
171
171
|
def __init__(self, **data: Any) -> None:
|
|
172
172
|
if "name" not in data:
|
|
@@ -191,8 +191,8 @@ class DateListDiffBounded(DateListDiff):
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
194
|
-
is_binary = True
|
|
195
|
-
output_type = "float"
|
|
194
|
+
is_binary: bool = True
|
|
195
|
+
output_type: Optional[str] = "float"
|
|
196
196
|
|
|
197
197
|
date_unit: Optional[str] = None
|
|
198
198
|
|
|
@@ -226,12 +226,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
226
226
|
|
|
227
227
|
|
|
228
228
|
class DatePercentile(DatePercentileBase):
|
|
229
|
-
name = "date_per"
|
|
230
|
-
alias = "date_per_method1"
|
|
229
|
+
name: str = "date_per"
|
|
230
|
+
alias: Optional[str] = "date_per_method1"
|
|
231
231
|
|
|
232
|
-
zero_month: Optional[int]
|
|
233
|
-
zero_year: Optional[int]
|
|
234
|
-
zero_bounds: Optional[List[float]]
|
|
232
|
+
zero_month: Optional[int] = None
|
|
233
|
+
zero_year: Optional[int] = None
|
|
234
|
+
zero_bounds: Optional[List[float]] = None
|
|
235
235
|
step: int = 30
|
|
236
236
|
|
|
237
237
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
@@ -246,7 +246,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
246
246
|
)
|
|
247
247
|
return res
|
|
248
248
|
|
|
249
|
-
@validator("zero_bounds", pre=True)
|
|
249
|
+
@field_validator("zero_bounds", mode="before")
|
|
250
250
|
def validate_bounds(cls, value):
|
|
251
251
|
if value is None or isinstance(value, list):
|
|
252
252
|
return value
|
|
@@ -264,7 +264,7 @@ class DatePercentile(DatePercentileBase):
|
|
|
264
264
|
|
|
265
265
|
|
|
266
266
|
class DatePercentileMethod2(DatePercentileBase):
|
|
267
|
-
name = "date_per_method2"
|
|
267
|
+
name: str = "date_per_method2"
|
|
268
268
|
|
|
269
269
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
270
270
|
pass
|
upgini/autofe/feature.py
CHANGED
|
@@ -82,9 +82,9 @@ class Feature:
|
|
|
82
82
|
self.alias = alias
|
|
83
83
|
|
|
84
84
|
def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
|
|
85
|
-
obj_dict = self.op.dict().copy()
|
|
85
|
+
obj_dict = self.op.model_dump().copy()
|
|
86
86
|
obj_dict.update(params or {})
|
|
87
|
-
self.op = self.op.__class__.parse_obj(obj_dict)
|
|
87
|
+
self.op = self.op.__class__.model_validate(obj_dict)
|
|
88
88
|
self.op.set_params(params)
|
|
89
89
|
|
|
90
90
|
for child in self.children:
|
upgini/autofe/groupby.py
CHANGED
|
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
9
9
|
agg: Optional[str]
|
|
10
|
-
is_vectorizable = True
|
|
11
|
-
is_grouping = True
|
|
12
|
-
is_distribution_dependent = True
|
|
10
|
+
is_vectorizable: bool = True
|
|
11
|
+
is_grouping: bool = True
|
|
12
|
+
is_distribution_dependent: bool = True
|
|
13
13
|
|
|
14
14
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
15
|
temp = left.groupby(right).agg(self.agg)
|
|
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name = "GroupByThenMedian"
|
|
28
|
-
pandas_agg = "median"
|
|
29
|
-
is_distribution_dependent = True
|
|
27
|
+
name: str = "GroupByThenMedian"
|
|
28
|
+
pandas_agg: str = "median"
|
|
29
|
+
is_distribution_dependent: bool = True
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
|
-
name = "GroupByThenRank"
|
|
34
|
-
is_vectorizable = True
|
|
35
|
-
is_grouping = True
|
|
36
|
-
output_type = "float"
|
|
37
|
-
is_distribution_dependent = True
|
|
33
|
+
name: str = "GroupByThenRank"
|
|
34
|
+
is_vectorizable: bool = True
|
|
35
|
+
is_grouping: bool = True
|
|
36
|
+
output_type: Optional[str] = "float"
|
|
37
|
+
is_distribution_dependent: bool = True
|
|
38
38
|
|
|
39
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
40
40
|
temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
|
|
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
52
|
-
name = "GroupByThenNUnique"
|
|
53
|
-
is_vectorizable = True
|
|
54
|
-
is_grouping = True
|
|
55
|
-
output_type = "int"
|
|
56
|
-
is_distribution_dependent = True
|
|
57
|
-
input_type = "discrete"
|
|
52
|
+
name: str = "GroupByThenNUnique"
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
is_grouping: bool = True
|
|
55
|
+
output_type: Optional[str] = "int"
|
|
56
|
+
is_distribution_dependent: bool = True
|
|
57
|
+
input_type: Optional[str] = "discrete"
|
|
58
58
|
|
|
59
59
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
60
60
|
nunique = left.groupby(right).nunique()
|
|
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class GroupByThenFreq(PandasOperand):
|
|
72
|
-
name = "GroupByThenFreq"
|
|
73
|
-
is_grouping = True
|
|
74
|
-
output_type = "float"
|
|
75
|
-
is_distribution_dependent = True
|
|
76
|
-
input_type = "discrete"
|
|
72
|
+
name: str = "GroupByThenFreq"
|
|
73
|
+
is_grouping: bool = True
|
|
74
|
+
output_type: Optional[str] = "float"
|
|
75
|
+
is_distribution_dependent: bool = True
|
|
76
|
+
input_type: Optional[str] = "discrete"
|
|
77
77
|
|
|
78
78
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
79
79
|
def _f(x):
|
upgini/autofe/operand.py
CHANGED
|
@@ -8,19 +8,19 @@ from pydantic import BaseModel
|
|
|
8
8
|
|
|
9
9
|
class Operand(BaseModel):
|
|
10
10
|
name: str
|
|
11
|
-
alias: Optional[str]
|
|
11
|
+
alias: Optional[str] = None
|
|
12
12
|
is_unary: bool = False
|
|
13
13
|
is_symmetrical: bool = False
|
|
14
14
|
has_symmetry_importance: bool = False
|
|
15
|
-
input_type: Optional[str]
|
|
16
|
-
output_type: Optional[str]
|
|
15
|
+
input_type: Optional[str] = None
|
|
16
|
+
output_type: Optional[str] = None
|
|
17
17
|
is_categorical: bool = False
|
|
18
18
|
is_vectorizable: bool = False
|
|
19
19
|
is_grouping: bool = False
|
|
20
20
|
is_binary: bool = False
|
|
21
21
|
is_vector: bool = False
|
|
22
22
|
is_distribution_dependent: bool = False
|
|
23
|
-
params: Optional[Dict[str, str]]
|
|
23
|
+
params: Optional[Dict[str, str]] = None
|
|
24
24
|
|
|
25
25
|
def set_params(self, params: Dict[str, str]):
|
|
26
26
|
self.params = params
|
upgini/autofe/unary.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from typing import Optional
|
|
1
2
|
import numpy as np
|
|
2
3
|
import pandas as pd
|
|
3
4
|
from sklearn.preprocessing import Normalizer
|
|
@@ -6,10 +7,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Abs(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "abs"
|
|
10
|
-
is_unary = True
|
|
11
|
-
is_vectorizable = True
|
|
12
|
-
group_index = 0
|
|
10
|
+
name: str = "abs"
|
|
11
|
+
is_unary: bool = True
|
|
12
|
+
is_vectorizable: bool = True
|
|
13
|
+
group_index: int = 0
|
|
13
14
|
|
|
14
15
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
15
16
|
return data.abs()
|
|
@@ -19,11 +20,11 @@ class Abs(PandasOperand, VectorizableMixin):
|
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class Log(PandasOperand, VectorizableMixin):
|
|
22
|
-
name = "log"
|
|
23
|
-
is_unary = True
|
|
24
|
-
is_vectorizable = True
|
|
25
|
-
output_type = "float"
|
|
26
|
-
group_index = 0
|
|
23
|
+
name: str = "log"
|
|
24
|
+
is_unary: bool = True
|
|
25
|
+
is_vectorizable: bool = True
|
|
26
|
+
output_type: Optional[str] = "float"
|
|
27
|
+
group_index: int = 0
|
|
27
28
|
|
|
28
29
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
29
30
|
return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
|
|
@@ -33,11 +34,11 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class Sqrt(PandasOperand, VectorizableMixin):
|
|
36
|
-
name = "sqrt"
|
|
37
|
-
is_unary = True
|
|
38
|
-
is_vectorizable = True
|
|
39
|
-
output_type = "float"
|
|
40
|
-
group_index = 0
|
|
37
|
+
name: str = "sqrt"
|
|
38
|
+
is_unary: bool = True
|
|
39
|
+
is_vectorizable: bool = True
|
|
40
|
+
output_type: Optional[str] = "float"
|
|
41
|
+
group_index: int = 0
|
|
41
42
|
|
|
42
43
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
43
44
|
return self._round_value(np.sqrt(np.abs(data)))
|
|
@@ -47,10 +48,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
class Square(PandasOperand, VectorizableMixin):
|
|
50
|
-
name = "square"
|
|
51
|
-
is_unary = True
|
|
52
|
-
is_vectorizable = True
|
|
53
|
-
group_index = 0
|
|
51
|
+
name: str = "square"
|
|
52
|
+
is_unary: bool = True
|
|
53
|
+
is_vectorizable: bool = True
|
|
54
|
+
group_index: int = 0
|
|
54
55
|
|
|
55
56
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
56
57
|
return np.square(data)
|
|
@@ -60,11 +61,11 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
63
|
-
name = "sigmoid"
|
|
64
|
-
is_unary = True
|
|
65
|
-
is_vectorizable = True
|
|
66
|
-
output_type = "float"
|
|
67
|
-
group_index = 0
|
|
64
|
+
name: str = "sigmoid"
|
|
65
|
+
is_unary: bool = True
|
|
66
|
+
is_vectorizable: bool = True
|
|
67
|
+
output_type: Optional[str] = "float"
|
|
68
|
+
group_index: int = 0
|
|
68
69
|
|
|
69
70
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
70
71
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
@@ -74,12 +75,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
74
75
|
|
|
75
76
|
|
|
76
77
|
class Floor(PandasOperand, VectorizableMixin):
|
|
77
|
-
name = "floor"
|
|
78
|
-
is_unary = True
|
|
79
|
-
is_vectorizable = True
|
|
80
|
-
output_type = "int"
|
|
81
|
-
input_type = "continuous"
|
|
82
|
-
group_index = 0
|
|
78
|
+
name: str = "floor"
|
|
79
|
+
is_unary: bool = True
|
|
80
|
+
is_vectorizable: bool = True
|
|
81
|
+
output_type: Optional[str] = "int"
|
|
82
|
+
input_type: Optional[str] = "continuous"
|
|
83
|
+
group_index: int = 0
|
|
83
84
|
|
|
84
85
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
85
86
|
return np.floor(data)
|
|
@@ -89,11 +90,11 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
89
90
|
|
|
90
91
|
|
|
91
92
|
class Residual(PandasOperand, VectorizableMixin):
|
|
92
|
-
name = "residual"
|
|
93
|
-
is_unary = True
|
|
94
|
-
is_vectorizable = True
|
|
95
|
-
input_type = "continuous"
|
|
96
|
-
group_index = 0
|
|
93
|
+
name: str = "residual"
|
|
94
|
+
is_unary: bool = True
|
|
95
|
+
is_vectorizable: bool = True
|
|
96
|
+
input_type: Optional[str] = "continuous"
|
|
97
|
+
group_index: int = 0
|
|
97
98
|
|
|
98
99
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
99
100
|
return data - np.floor(data)
|
|
@@ -103,11 +104,11 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
103
104
|
|
|
104
105
|
|
|
105
106
|
class Freq(PandasOperand):
|
|
106
|
-
name = "freq"
|
|
107
|
-
is_unary = True
|
|
108
|
-
output_type = "float"
|
|
109
|
-
is_distribution_dependent = True
|
|
110
|
-
input_type = "discrete"
|
|
107
|
+
name: str = "freq"
|
|
108
|
+
is_unary: bool = True
|
|
109
|
+
output_type: Optional[str] = "float"
|
|
110
|
+
is_distribution_dependent: bool = True
|
|
111
|
+
input_type: Optional[str] = "discrete"
|
|
111
112
|
|
|
112
113
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
113
114
|
value_counts = data.value_counts(normalize=True)
|
|
@@ -115,9 +116,9 @@ class Freq(PandasOperand):
|
|
|
115
116
|
|
|
116
117
|
|
|
117
118
|
class Norm(PandasOperand):
|
|
118
|
-
name = "norm"
|
|
119
|
-
is_unary = True
|
|
120
|
-
output_type = "float"
|
|
119
|
+
name: str = "norm"
|
|
120
|
+
is_unary: bool = True
|
|
121
|
+
output_type: Optional[str] = "float"
|
|
121
122
|
|
|
122
123
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
123
124
|
data_dropna = data.dropna()
|
|
@@ -131,7 +132,7 @@ class Norm(PandasOperand):
|
|
|
131
132
|
|
|
132
133
|
|
|
133
134
|
class Embeddings(PandasOperand):
|
|
134
|
-
name = "emb"
|
|
135
|
-
is_unary = True
|
|
136
|
-
input_type = "string"
|
|
137
|
-
output_type = "vector"
|
|
135
|
+
name: str = "emb"
|
|
136
|
+
is_unary: bool = True
|
|
137
|
+
input_type: Optional[str] = "string"
|
|
138
|
+
output_type: Optional[str] = "vector"
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
9
|
-
name = "mean"
|
|
10
|
-
output_type = "float"
|
|
11
|
-
is_vector = True
|
|
12
|
-
group_index = 0
|
|
9
|
+
name: str = "mean"
|
|
10
|
+
output_type: Optional[str] = "float"
|
|
11
|
+
is_vector: bool = True
|
|
12
|
+
group_index: int = 0
|
|
13
13
|
|
|
14
14
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
15
15
|
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class Sum(PandasOperand, VectorizableMixin):
|
|
19
|
-
name = "sum"
|
|
20
|
-
is_vector = True
|
|
21
|
-
group_index = 0
|
|
19
|
+
name: str = "sum"
|
|
20
|
+
is_vector: bool = True
|
|
21
|
+
group_index: int = 0
|
|
22
22
|
|
|
23
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
upgini/features_enricher.py
CHANGED
|
@@ -23,7 +23,6 @@ from pandas.api.types import (
|
|
|
23
23
|
is_datetime64_any_dtype,
|
|
24
24
|
is_numeric_dtype,
|
|
25
25
|
is_object_dtype,
|
|
26
|
-
is_period_dtype,
|
|
27
26
|
is_string_dtype,
|
|
28
27
|
)
|
|
29
28
|
from scipy.stats import ks_2samp
|
|
@@ -1408,7 +1407,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1408
1407
|
# TODO maybe there is no more need for these convertions
|
|
1409
1408
|
# Remove datetime features
|
|
1410
1409
|
datetime_features = [
|
|
1411
|
-
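f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])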
|
|
1410
|
+
f
|
|
1411
|
+
for f in fitting_X.columns
|
|
1412
|
+
if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
|
|
1412
1413
|
]
|
|
1413
1414
|
if len(datetime_features) > 0:
|
|
1414
1415
|
self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
|
upgini/http.py
CHANGED
|
@@ -39,17 +39,6 @@ from upgini.metadata import (
|
|
|
39
39
|
from upgini.resource_bundle import bundle
|
|
40
40
|
from upgini.utils.track_info import get_track_metrics
|
|
41
41
|
|
|
42
|
-
# try:
|
|
43
|
-
# from importlib.metadata import version # type: ignore
|
|
44
|
-
|
|
45
|
-
# __version__ = version("upgini")
|
|
46
|
-
# except ImportError:
|
|
47
|
-
# try:
|
|
48
|
-
# from importlib_metadata import version # type: ignore
|
|
49
|
-
|
|
50
|
-
# __version__ = version("upgini")
|
|
51
|
-
# except ImportError:
|
|
52
|
-
# __version__ = "Upgini wasn't installed"
|
|
53
42
|
|
|
54
43
|
UPGINI_URL: str = "UPGINI_URL"
|
|
55
44
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
@@ -452,18 +441,18 @@ class _RestClient:
|
|
|
452
441
|
content = file.read()
|
|
453
442
|
md5_hash.update(content)
|
|
454
443
|
digest = md5_hash.hexdigest()
|
|
455
|
-
metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
|
|
444
|
+
metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
|
|
456
445
|
|
|
457
446
|
digest_sha256 = hashlib.sha256(
|
|
458
447
|
pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
459
448
|
).hexdigest()
|
|
460
|
-
metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
|
|
449
|
+
metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
|
|
461
450
|
|
|
462
451
|
with open(file_path, "rb") as file:
|
|
463
452
|
files = {
|
|
464
453
|
"metadata": (
|
|
465
454
|
"metadata.json",
|
|
466
|
-
metadata_with_md5.json(exclude_none=True).encode(),
|
|
455
|
+
metadata_with_md5.model_dump_json(exclude_none=True).encode(),
|
|
467
456
|
"application/json",
|
|
468
457
|
),
|
|
469
458
|
"tracking": (
|
|
@@ -471,13 +460,13 @@ class _RestClient:
|
|
|
471
460
|
dumps(track_metrics).encode(),
|
|
472
461
|
"application/json",
|
|
473
462
|
),
|
|
474
|
-
"metrics": ("metrics.json", metrics.
|
|
463
|
+
"metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
475
464
|
"file": (metadata_with_md5.name, file, "application/octet-stream"),
|
|
476
465
|
}
|
|
477
466
|
if search_customization is not None:
|
|
478
467
|
files["customization"] = (
|
|
479
468
|
"customization.json",
|
|
480
|
-
search_customization.json(exclude_none=True).encode(),
|
|
469
|
+
search_customization.model_dump_json(exclude_none=True).encode(),
|
|
481
470
|
"application/json",
|
|
482
471
|
)
|
|
483
472
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
|
@@ -492,7 +481,7 @@ class _RestClient:
|
|
|
492
481
|
def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
|
|
493
482
|
api_path = self.CHECK_UPLOADED_FILE_URL_FMT_V2.format(file_upload_id)
|
|
494
483
|
response = self._with_unauth_retry(
|
|
495
|
-
lambda: self._send_post_req(api_path, trace_id, metadata.json(exclude_none=True))
|
|
484
|
+
lambda: self._send_post_req(api_path, trace_id, metadata.model_dump_json(exclude_none=True))
|
|
496
485
|
)
|
|
497
486
|
return bool(response)
|
|
498
487
|
|
|
@@ -506,11 +495,11 @@ class _RestClient:
|
|
|
506
495
|
) -> SearchTaskResponse:
|
|
507
496
|
api_path = self.INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id)
|
|
508
497
|
files = {
|
|
509
|
-
"metadata": ("metadata.json", metadata.
|
|
510
|
-
"metrics": ("metrics.json", metrics.
|
|
498
|
+
"metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
499
|
+
"metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
511
500
|
}
|
|
512
501
|
if search_customization is not None:
|
|
513
|
-
files["customization"] = search_customization.json(exclude_none=True).encode()
|
|
502
|
+
files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
|
|
514
503
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
|
515
504
|
response = self._with_unauth_retry(
|
|
516
505
|
lambda: self._send_post_file_req_v2(
|
|
@@ -536,18 +525,18 @@ class _RestClient:
|
|
|
536
525
|
content = file.read()
|
|
537
526
|
md5_hash.update(content)
|
|
538
527
|
digest = md5_hash.hexdigest()
|
|
539
|
-
metadata_with_md5 = metadata.
|
|
528
|
+
metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
|
|
540
529
|
|
|
541
530
|
digest_sha256 = hashlib.sha256(
|
|
542
531
|
pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
543
532
|
).hexdigest()
|
|
544
|
-
metadata_with_md5 = metadata_with_md5.
|
|
533
|
+
metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
|
|
545
534
|
|
|
546
535
|
with open(file_path, "rb") as file:
|
|
547
536
|
files = {
|
|
548
537
|
"metadata": (
|
|
549
538
|
"metadata.json",
|
|
550
|
-
metadata_with_md5.
|
|
539
|
+
metadata_with_md5.model_dump_json(exclude_none=True).encode(),
|
|
551
540
|
"application/json",
|
|
552
541
|
),
|
|
553
542
|
"tracking": (
|
|
@@ -555,13 +544,13 @@ class _RestClient:
|
|
|
555
544
|
dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
|
|
556
545
|
"application/json",
|
|
557
546
|
),
|
|
558
|
-
"metrics": ("metrics.json", metrics.
|
|
547
|
+
"metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
559
548
|
"file": (metadata_with_md5.name, file, "application/octet-stream"),
|
|
560
549
|
}
|
|
561
550
|
if search_customization is not None:
|
|
562
551
|
files["customization"] = (
|
|
563
552
|
"customization.json",
|
|
564
|
-
search_customization.
|
|
553
|
+
search_customization.model_dump_json(exclude_none=True).encode(),
|
|
565
554
|
"application/json",
|
|
566
555
|
)
|
|
567
556
|
|
|
@@ -585,11 +574,11 @@ class _RestClient:
|
|
|
585
574
|
) -> SearchTaskResponse:
|
|
586
575
|
api_path = self.VALIDATION_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id, initial_search_task_id)
|
|
587
576
|
files = {
|
|
588
|
-
"metadata": ("metadata.json", metadata.
|
|
589
|
-
"metrics": ("metrics.json", metrics.
|
|
577
|
+
"metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
578
|
+
"metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
590
579
|
}
|
|
591
580
|
if search_customization is not None:
|
|
592
|
-
files["customization"] = search_customization.
|
|
581
|
+
files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
|
|
593
582
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
|
594
583
|
response = self._with_unauth_retry(
|
|
595
584
|
lambda: self._send_post_file_req_v2(
|
|
@@ -651,7 +640,7 @@ class _RestClient:
|
|
|
651
640
|
with open(file_path, "rb") as file:
|
|
652
641
|
files = {
|
|
653
642
|
"file": (metadata.name, file, "application/octet-stream"),
|
|
654
|
-
"metadata": ("metadata.json", metadata.
|
|
643
|
+
"metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
|
|
655
644
|
}
|
|
656
645
|
|
|
657
646
|
return self._send_post_file_req_v2(api_path, files)
|
|
@@ -661,12 +650,12 @@ class _RestClient:
|
|
|
661
650
|
def get_search_file_metadata(self, search_task_id: str, trace_id: str) -> FileMetadata:
|
|
662
651
|
api_path = self.SEARCH_FILE_METADATA_URI_FMT_V2.format(search_task_id)
|
|
663
652
|
response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
|
|
664
|
-
return FileMetadata.
|
|
653
|
+
return FileMetadata.model_validate(response)
|
|
665
654
|
|
|
666
655
|
def get_provider_search_metadata_v3(self, provider_search_task_id: str, trace_id: str) -> ProviderTaskMetadataV2:
|
|
667
656
|
api_path = self.SEARCH_TASK_METADATA_FMT_V3.format(provider_search_task_id)
|
|
668
657
|
response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
|
|
669
|
-
return ProviderTaskMetadataV2.
|
|
658
|
+
return ProviderTaskMetadataV2.model_validate(response)
|
|
670
659
|
|
|
671
660
|
def get_current_transform_usage(self, trace_id) -> TransformUsage:
|
|
672
661
|
track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
|
upgini/lazy_import.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import importlib
|
|
2
|
+
import importlib.util
|
|
3
|
+
import importlib.machinery
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
class LazyImport:
|
|
@@ -10,7 +12,18 @@ class LazyImport:
|
|
|
10
12
|
|
|
11
13
|
def _load(self):
|
|
12
14
|
if self._module is None:
|
|
13
|
-
|
|
15
|
+
# Load module and save link to it
|
|
16
|
+
spec = importlib.util.find_spec(self.module_name)
|
|
17
|
+
if spec is None:
|
|
18
|
+
raise ImportError(f"Module {self.module_name} not found")
|
|
19
|
+
|
|
20
|
+
# Create module
|
|
21
|
+
self._module = importlib.util.module_from_spec(spec)
|
|
22
|
+
|
|
23
|
+
# Execute module
|
|
24
|
+
spec.loader.exec_module(self._module)
|
|
25
|
+
|
|
26
|
+
# Get class from module
|
|
14
27
|
self._class = getattr(self._module, self.class_name)
|
|
15
28
|
|
|
16
29
|
def __call__(self, *args, **kwargs):
|
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Dict, List, Optional, Set, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -172,23 +172,23 @@ class FileMetricsInterval(BaseModel):
|
|
|
172
172
|
date_cut: float
|
|
173
173
|
count: float
|
|
174
174
|
valid_count: float
|
|
175
|
-
avg_target: Optional[float] # not for multiclass
|
|
176
|
-
avg_score_etalon: Optional[float]
|
|
175
|
+
avg_target: Optional[float] = None # not for multiclass
|
|
176
|
+
avg_score_etalon: Optional[float] = None
|
|
177
177
|
|
|
178
178
|
|
|
179
179
|
class FileMetrics(BaseModel):
|
|
180
180
|
# etalon metadata
|
|
181
|
-
task_type: Optional[ModelTaskType]
|
|
182
|
-
label: Optional[ModelLabelType]
|
|
183
|
-
count: Optional[int]
|
|
184
|
-
valid_count: Optional[int]
|
|
185
|
-
valid_rate: Optional[float]
|
|
186
|
-
avg_target: Optional[float]
|
|
187
|
-
metrics_binary_etalon: Optional[BinaryTask]
|
|
188
|
-
metrics_regression_etalon: Optional[RegressionTask]
|
|
189
|
-
metrics_multiclass_etalon: Optional[MulticlassTask]
|
|
190
|
-
cuts: Optional[List[float]]
|
|
191
|
-
interval: Optional[List[FileMetricsInterval]]
|
|
181
|
+
task_type: Optional[ModelTaskType] = None
|
|
182
|
+
label: Optional[ModelLabelType] = None
|
|
183
|
+
count: Optional[int] = None
|
|
184
|
+
valid_count: Optional[int] = None
|
|
185
|
+
valid_rate: Optional[float] = None
|
|
186
|
+
avg_target: Optional[float] = None
|
|
187
|
+
metrics_binary_etalon: Optional[BinaryTask] = None
|
|
188
|
+
metrics_regression_etalon: Optional[RegressionTask] = None
|
|
189
|
+
metrics_multiclass_etalon: Optional[MulticlassTask] = None
|
|
190
|
+
cuts: Optional[List[float]] = None
|
|
191
|
+
interval: Optional[List[FileMetricsInterval]] = None
|
|
192
192
|
|
|
193
193
|
|
|
194
194
|
class NumericInterval(BaseModel):
|
|
@@ -202,25 +202,25 @@ class FileColumnMetadata(BaseModel):
|
|
|
202
202
|
dataType: DataType
|
|
203
203
|
meaningType: FileColumnMeaningType
|
|
204
204
|
minMaxValues: Optional[NumericInterval] = None
|
|
205
|
-
originalName: Optional[str]
|
|
205
|
+
originalName: Optional[str] = None
|
|
206
206
|
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
207
207
|
isUnnest: bool = False
|
|
208
208
|
# list of original etalon key column names like msisdn1, msisdn2
|
|
209
|
-
unnestKeyNames: Optional[List[str]]
|
|
209
|
+
unnestKeyNames: Optional[List[str]] = None
|
|
210
210
|
|
|
211
211
|
|
|
212
212
|
class FileMetadata(BaseModel):
|
|
213
213
|
name: str
|
|
214
|
-
description: Optional[str]
|
|
214
|
+
description: Optional[str] = None
|
|
215
215
|
columns: List[FileColumnMetadata]
|
|
216
216
|
searchKeys: List[List[str]]
|
|
217
|
-
excludeFeaturesSources: Optional[List[str]]
|
|
218
|
-
hierarchicalGroupKeys: Optional[List[str]]
|
|
219
|
-
hierarchicalSubgroupKeys: Optional[List[str]]
|
|
220
|
-
taskType: Optional[ModelTaskType]
|
|
221
|
-
rowsCount: Optional[int]
|
|
222
|
-
checksumMD5: Optional[str]
|
|
223
|
-
digest: Optional[str]
|
|
217
|
+
excludeFeaturesSources: Optional[List[str]] = None
|
|
218
|
+
hierarchicalGroupKeys: Optional[List[str]] = None
|
|
219
|
+
hierarchicalSubgroupKeys: Optional[List[str]] = None
|
|
220
|
+
taskType: Optional[ModelTaskType] = None
|
|
221
|
+
rowsCount: Optional[int] = None
|
|
222
|
+
checksumMD5: Optional[str] = None
|
|
223
|
+
digest: Optional[str] = None
|
|
224
224
|
|
|
225
225
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
|
226
226
|
for c in self.columns:
|
|
@@ -244,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
|
|
|
244
244
|
source: str
|
|
245
245
|
hit_rate: float
|
|
246
246
|
shap_value: float
|
|
247
|
-
commercial_schema: Optional[str]
|
|
248
|
-
data_provider: Optional[str]
|
|
249
|
-
data_providers: Optional[List[str]]
|
|
250
|
-
data_provider_link: Optional[str]
|
|
251
|
-
data_provider_links: Optional[List[str]]
|
|
252
|
-
data_source: Optional[str]
|
|
253
|
-
data_sources: Optional[List[str]]
|
|
254
|
-
data_source_link: Optional[str]
|
|
255
|
-
data_source_links: Optional[List[str]]
|
|
256
|
-
doc_link: Optional[str]
|
|
257
|
-
update_frequency: Optional[str]
|
|
247
|
+
commercial_schema: Optional[str] = None
|
|
248
|
+
data_provider: Optional[str] = None
|
|
249
|
+
data_providers: Optional[List[str]] = None
|
|
250
|
+
data_provider_link: Optional[str] = None
|
|
251
|
+
data_provider_links: Optional[List[str]] = None
|
|
252
|
+
data_source: Optional[str] = None
|
|
253
|
+
data_sources: Optional[List[str]] = None
|
|
254
|
+
data_source_link: Optional[str] = None
|
|
255
|
+
data_source_links: Optional[List[str]] = None
|
|
256
|
+
doc_link: Optional[str] = None
|
|
257
|
+
update_frequency: Optional[str] = None
|
|
258
258
|
|
|
259
259
|
|
|
260
260
|
class HitRateMetrics(BaseModel):
|
|
@@ -274,48 +274,48 @@ class ModelEvalSet(BaseModel):
|
|
|
274
274
|
class BaseColumnMetadata(BaseModel):
|
|
275
275
|
original_name: str
|
|
276
276
|
hashed_name: str
|
|
277
|
-
ads_definition_id: Optional[str]
|
|
277
|
+
ads_definition_id: Optional[str] = None
|
|
278
278
|
is_augmented: bool
|
|
279
279
|
|
|
280
280
|
|
|
281
281
|
class GeneratedFeatureMetadata(BaseModel):
|
|
282
|
-
alias: Optional[str]
|
|
282
|
+
alias: Optional[str] = None
|
|
283
283
|
formula: str
|
|
284
284
|
display_index: str
|
|
285
285
|
base_columns: List[BaseColumnMetadata]
|
|
286
|
-
operator_params: Optional[Dict[str, str]]
|
|
286
|
+
operator_params: Optional[Dict[str, str]] = None
|
|
287
287
|
|
|
288
288
|
|
|
289
289
|
class ProviderTaskMetadataV2(BaseModel):
|
|
290
290
|
features: List[FeaturesMetadataV2]
|
|
291
|
-
hit_rate_metrics: Optional[HitRateMetrics]
|
|
292
|
-
eval_set_metrics: Optional[List[ModelEvalSet]]
|
|
293
|
-
zero_hit_rate_search_keys: Optional[List[str]]
|
|
294
|
-
features_used_for_embeddings: Optional[List[str]]
|
|
295
|
-
shuffle_kfold: Optional[bool]
|
|
296
|
-
generated_features: Optional[List[GeneratedFeatureMetadata]]
|
|
291
|
+
hit_rate_metrics: Optional[HitRateMetrics] = None
|
|
292
|
+
eval_set_metrics: Optional[List[ModelEvalSet]] = None
|
|
293
|
+
zero_hit_rate_search_keys: Optional[List[str]] = None
|
|
294
|
+
features_used_for_embeddings: Optional[List[str]] = None
|
|
295
|
+
shuffle_kfold: Optional[bool] = None
|
|
296
|
+
generated_features: Optional[List[GeneratedFeatureMetadata]] = None
|
|
297
297
|
|
|
298
298
|
|
|
299
299
|
class FeaturesFilter(BaseModel):
|
|
300
|
-
minImportance: Optional[float]
|
|
301
|
-
maxPSI: Optional[float]
|
|
302
|
-
maxCount: Optional[int]
|
|
303
|
-
selectedFeatures: Optional[List[str]]
|
|
300
|
+
minImportance: Optional[float] = None
|
|
301
|
+
maxPSI: Optional[float] = None
|
|
302
|
+
maxCount: Optional[int] = None
|
|
303
|
+
selectedFeatures: Optional[List[str]] = None
|
|
304
304
|
|
|
305
305
|
|
|
306
306
|
class RuntimeParameters(BaseModel):
|
|
307
|
-
properties: Dict[str,
|
|
307
|
+
properties: Dict[str, Any] = {}
|
|
308
308
|
|
|
309
309
|
|
|
310
310
|
class SearchCustomization(BaseModel):
|
|
311
|
-
featuresFilter: Optional[FeaturesFilter]
|
|
312
|
-
extractFeatures: Optional[bool]
|
|
313
|
-
accurateModel: Optional[bool]
|
|
314
|
-
importanceThreshold: Optional[float]
|
|
315
|
-
maxFeatures: Optional[int]
|
|
316
|
-
returnScores: Optional[bool]
|
|
317
|
-
runtimeParameters: Optional[RuntimeParameters]
|
|
318
|
-
metricsCalculation: Optional[bool]
|
|
311
|
+
featuresFilter: Optional[FeaturesFilter] = None
|
|
312
|
+
extractFeatures: Optional[bool] = None
|
|
313
|
+
accurateModel: Optional[bool] = None
|
|
314
|
+
importanceThreshold: Optional[float] = None
|
|
315
|
+
maxFeatures: Optional[int] = None
|
|
316
|
+
returnScores: Optional[bool] = None
|
|
317
|
+
runtimeParameters: Optional[RuntimeParameters] = None
|
|
318
|
+
metricsCalculation: Optional[bool] = None
|
|
319
319
|
|
|
320
320
|
def __repr__(self):
|
|
321
321
|
return (
|
|
@@ -10,7 +10,6 @@ from pandas.api.types import (
|
|
|
10
10
|
is_float_dtype,
|
|
11
11
|
is_numeric_dtype,
|
|
12
12
|
is_object_dtype,
|
|
13
|
-
is_period_dtype,
|
|
14
13
|
is_string_dtype,
|
|
15
14
|
)
|
|
16
15
|
|
|
@@ -135,7 +134,7 @@ class Normalizer:
|
|
|
135
134
|
|
|
136
135
|
removed_features = []
|
|
137
136
|
for f in features:
|
|
138
|
-
if is_datetime(df[f]) or
|
|
137
|
+
if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
|
|
139
138
|
removed_features.append(f)
|
|
140
139
|
df.drop(columns=f, inplace=True)
|
|
141
140
|
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import is_numeric_dtype
|
|
9
|
+
from pandas.api.types import is_numeric_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
12
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
@@ -84,7 +84,7 @@ class DateTimeSearchKeyConverter:
|
|
|
84
84
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
85
85
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
86
86
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
87
|
-
elif
|
|
87
|
+
elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
|
|
88
88
|
df[self.date_column] = df[self.date_column].dt.to_timestamp()
|
|
89
89
|
elif is_numeric_dtype(df[self.date_column]):
|
|
90
90
|
# 315532801 - 2524608001 - seconds
|
|
@@ -207,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
|
207
207
|
def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
|
|
208
208
|
df = df.copy()
|
|
209
209
|
seconds = "datetime_seconds"
|
|
210
|
-
if
|
|
210
|
+
if isinstance(df[date_col].dtype, pd.PeriodDtype):
|
|
211
211
|
df[date_col] = df[date_col].dt.to_timestamp()
|
|
212
212
|
else:
|
|
213
213
|
df[date_col] = pd.to_datetime(df[date_col])
|
|
@@ -275,7 +275,7 @@ def validate_dates_distribution(
|
|
|
275
275
|
if col in search_keys:
|
|
276
276
|
continue
|
|
277
277
|
try:
|
|
278
|
-
if
|
|
278
|
+
if isinstance(X[col].dtype, pd.PeriodDtype):
|
|
279
279
|
pass
|
|
280
280
|
elif pd.__version__ >= "2.0.0":
|
|
281
281
|
# Format mixed to avoid massive warnings
|
|
@@ -290,7 +290,7 @@ def validate_dates_distribution(
|
|
|
290
290
|
if maybe_date_col is None:
|
|
291
291
|
return
|
|
292
292
|
|
|
293
|
-
if
|
|
293
|
+
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
294
294
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
295
295
|
elif pd.__version__ >= "2.0.0":
|
|
296
296
|
dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
|
upgini/utils/phone_utils.py
CHANGED
|
@@ -1,12 +1,8 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
|
+
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import
|
|
5
|
-
is_float_dtype,
|
|
6
|
-
is_int64_dtype,
|
|
7
|
-
is_object_dtype,
|
|
8
|
-
is_string_dtype,
|
|
9
|
-
)
|
|
5
|
+
from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
|
|
10
6
|
|
|
11
7
|
from upgini.errors import ValidationError
|
|
12
8
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
@@ -63,7 +59,9 @@ class PhoneSearchKeyConverter:
|
|
|
63
59
|
convert_func = self.phone_str_to_int_safe
|
|
64
60
|
elif is_float_dtype(df[self.phone_column]):
|
|
65
61
|
convert_func = self.phone_float_to_int_safe
|
|
66
|
-
elif
|
|
62
|
+
elif df[self.phone_column].dtype == np.int64 or isinstance(
|
|
63
|
+
df[self.phone_column].dtype, pd.Int64Dtype
|
|
64
|
+
):
|
|
67
65
|
convert_func = self.phone_int_to_int_safe
|
|
68
66
|
else:
|
|
69
67
|
raise ValidationError(
|
upgini/utils/target_utils.py
CHANGED
|
@@ -194,4 +194,7 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
|
194
194
|
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
195
195
|
|
|
196
196
|
# Calculate the PSI
|
|
197
|
-
|
|
197
|
+
try:
|
|
198
|
+
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
199
|
+
except Exception:
|
|
200
|
+
return np.nan
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.316a1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
|
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
|
-
Requires-Dist: numpy
|
|
32
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
|
-
Requires-Dist: pydantic
|
|
34
|
+
Requires-Dist: pydantic>=2.7.0
|
|
35
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
36
|
Requires-Dist: python-bidi==0.4.2
|
|
37
37
|
Requires-Dist: python-dateutil>=2.8.0
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=31UCeRaXiz7DRsmJ3BKvypU0ky5w4Itv5qqPPf4BU9I,26
|
|
2
2
|
upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
8
|
-
upgini/lazy_import.py,sha256=
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=_d8ya5RRoYN0o6mV6gda-bLdOngQ4rb1SA51SlM_TG0,188002
|
|
7
|
+
upgini/http.py,sha256=_A_DGMk8gkygdVFCDp8I6js_re4YX34PB9TpJV8aPqo,42784
|
|
8
|
+
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
+
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
10
|
upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
|
|
11
11
|
upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -15,19 +15,19 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
|
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
21
|
-
upgini/autofe/groupby.py,sha256=
|
|
22
|
-
upgini/autofe/operand.py,sha256=
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
18
|
+
upgini/autofe/binary.py,sha256=xRBT7RNqQ7pprz6cRpO1KnvZCb7PvU3QXBfaP6Omqi4,7425
|
|
19
|
+
upgini/autofe/date.py,sha256=eLPrO2Cgm74VB4rPtIaeUDuI5vmLiGnygHSvU4aGHWU,9223
|
|
20
|
+
upgini/autofe/feature.py,sha256=CivPkE7YrAtDrgF8WhVPnDAnNDR8gbRQ-8_hXiQE6ew,14234
|
|
21
|
+
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
22
|
+
upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
|
|
23
|
+
upgini/autofe/unary.py,sha256=RiK-Fz3fgjPlqWWfro6x7qChjEZ8W8RTnl5-MT1kaQA,4218
|
|
24
|
+
upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
33
|
upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
|
|
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
42
42
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
|
|
53
|
-
upgini/utils/phone_utils.py,sha256=
|
|
53
|
+
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
54
54
|
upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.316a1.dist-info/METADATA,sha256=eJXt7Ga1qWst0_EIHCQYMTnAf0EkeO73tt5hbx4K_5g,48226
|
|
61
|
+
upgini-1.1.316a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.1.316a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.316a1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|