upgini 1.1.316a4__py3-none-any.whl → 1.1.317__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.316a4"
1
+ __version__ = "1.1.317"
upgini/autofe/binary.py CHANGED
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
12
- name: str = "min"
13
- is_binary: bool = True
14
- is_symmetrical: bool = True
15
- has_symmetry_importance: bool = True
12
+ name = "min"
13
+ is_binary = True
14
+ is_symmetrical = True
15
+ has_symmetry_importance = True
16
16
 
17
17
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
21
  class Max(PandasOperand):
22
- name: str = "max"
23
- is_binary: bool = True
24
- is_symmetrical: bool = True
25
- has_symmetry_importance: bool = True
22
+ name = "max"
23
+ is_binary = True
24
+ is_symmetrical = True
25
+ has_symmetry_importance = True
26
26
 
27
27
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
31
  class Add(PandasOperand, VectorizableMixin):
32
- name: str = "+"
33
- alias: str = "add"
34
- is_binary: bool = True
35
- is_symmetrical: bool = True
36
- has_symmetry_importance: bool = True
37
- is_vectorizable: bool = True
32
+ name = "+"
33
+ alias = "add"
34
+ is_binary = True
35
+ is_symmetrical = True
36
+ has_symmetry_importance = True
37
+ is_vectorizable = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
48
48
 
49
49
 
50
50
  class Subtract(PandasOperand, VectorizableMixin):
51
- name: str = "-"
52
- alias: str = "sub"
53
- is_binary: bool = True
54
- is_symmetrical: bool = True
55
- has_symmetry_importance: bool = True
56
- is_vectorizable: bool = True
51
+ name = "-"
52
+ alias = "sub"
53
+ is_binary = True
54
+ is_symmetrical = True
55
+ has_symmetry_importance = True
56
+ is_vectorizable = True
57
57
 
58
58
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
59
59
  return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
67
67
 
68
68
 
69
69
  class Multiply(PandasOperand, VectorizableMixin):
70
- name: str = "*"
71
- alias: str = "mul"
72
- is_binary: bool = True
73
- is_symmetrical: bool = True
74
- has_symmetry_importance: bool = True
75
- is_vectorizable: bool = True
70
+ name = "*"
71
+ alias = "mul"
72
+ is_binary = True
73
+ is_symmetrical = True
74
+ has_symmetry_importance = True
75
+ is_vectorizable = True
76
76
 
77
77
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
78
78
  return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
86
86
 
87
87
 
88
88
  class Divide(PandasOperand, VectorizableMixin):
89
- name: str = "/"
90
- alias: str = "div"
91
- is_binary: bool = True
92
- has_symmetry_importance: bool = True
93
- is_vectorizable: bool = True
94
- output_type: Optional[str] = "float"
89
+ name = "/"
90
+ alias = "div"
91
+ is_binary = True
92
+ has_symmetry_importance = True
93
+ is_vectorizable = True
94
+ output_type = "float"
95
95
 
96
96
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
97
97
  return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
105
105
 
106
106
 
107
107
  class Combine(PandasOperand):
108
- name: str = "Combine"
109
- is_binary: bool = True
110
- has_symmetry_importance: bool = True
111
- output_type: Optional[str] = "object"
108
+ name = "Combine"
109
+ is_binary = True
110
+ has_symmetry_importance = True
111
+ output_type = "object"
112
112
 
113
113
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
114
114
  temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
117
117
 
118
118
 
119
119
  class CombineThenFreq(PandasOperand):
120
- name: str = "CombineThenFreq"
121
- is_binary: bool = True
122
- is_symmetrical: bool = True
123
- has_symmetry_importance: bool = True
124
- output_type: Optional[str] = "float"
125
- is_distribution_dependent: bool = True
126
- input_type: Optional[str] = "discrete"
120
+ name = "CombineThenFreq"
121
+ is_binary = True
122
+ is_symmetrical = True
123
+ has_symmetry_importance = True
124
+ output_type = "float"
125
+ is_distribution_dependent = True
126
+ input_type = "discrete"
127
127
 
128
128
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
129
129
  temp = left.astype(str) + "_" + right.astype(str)
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
133
133
 
134
134
 
135
135
  class Distance(PandasOperand):
136
- name: str = "dist"
137
- is_binary: bool = True
138
- output_type: Optional[str] = "float"
139
- is_symmetrical: bool = True
140
- has_symmetry_importance: bool = True
136
+ name = "dist"
137
+ is_binary = True
138
+ output_type = "float"
139
+ is_symmetrical = True
140
+ has_symmetry_importance = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
144
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
144
+ 1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
145
145
  )
146
146
 
147
147
  # row-wise dot product
@@ -152,14 +152,17 @@ class Distance(PandasOperand):
152
152
  res = res.reindex(left.index.union(right.index))
153
153
  return res
154
154
 
155
+ def __norm(self, vector: pd.Series) -> pd.Series:
156
+ return np.sqrt(self.__dot(vector, vector))
157
+
155
158
 
156
159
  # Left for backward compatibility
157
160
  class Sim(Distance):
158
- name: str = "sim"
159
- is_binary: bool = True
160
- output_type: Optional[str] = "float"
161
- is_symmetrical: bool = True
162
- has_symmetry_importance: bool = True
161
+ name = "sim"
162
+ is_binary = True
163
+ output_type = "float"
164
+ is_symmetrical = True
165
+ has_symmetry_importance = True
163
166
 
164
167
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
165
168
  return 1 - super().calculate_binary(left, right)
@@ -188,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
188
191
 
189
192
 
190
193
  class JaroWinklerSim1(StringSim):
191
- name: str = "sim_jw1"
192
- is_binary: bool = True
193
- input_type: Optional[str] = "string"
194
- output_type: Optional[str] = "float"
195
- is_symmetrical: bool = True
196
- has_symmetry_importance: bool = True
194
+ name = "sim_jw1"
195
+ is_binary = True
196
+ input_type = "string"
197
+ output_type = "float"
198
+ is_symmetrical = True
199
+ has_symmetry_importance = True
197
200
 
198
201
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
199
202
  return value
@@ -203,12 +206,12 @@ class JaroWinklerSim1(StringSim):
203
206
 
204
207
 
205
208
  class JaroWinklerSim2(StringSim):
206
- name: str = "sim_jw2"
207
- is_binary: bool = True
208
- input_type: Optional[str] = "string"
209
- output_type: Optional[str] = "float"
210
- is_symmetrical: bool = True
211
- has_symmetry_importance: bool = True
209
+ name = "sim_jw2"
210
+ is_binary = True
211
+ input_type = "string"
212
+ output_type = "float"
213
+ is_symmetrical = True
214
+ has_symmetry_importance = True
212
215
 
213
216
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
214
217
  return value[::-1] if value is not None else None
@@ -218,12 +221,12 @@ class JaroWinklerSim2(StringSim):
218
221
 
219
222
 
220
223
  class LevenshteinSim(StringSim):
221
- name: str = "sim_lv"
222
- is_binary: bool = True
223
- input_type: Optional[str] = "string"
224
- output_type: Optional[str] = "float"
225
- is_symmetrical: bool = True
226
- has_symmetry_importance: bool = True
224
+ name = "sim_lv"
225
+ is_binary = True
226
+ input_type = "string"
227
+ output_type = "float"
228
+ is_symmetrical = True
229
+ has_symmetry_importance = True
227
230
 
228
231
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
229
232
  return value
upgini/autofe/date.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import abc
2
+ import json
2
3
  from typing import Any, Dict, List, Optional, Union
3
4
 
4
5
  import numpy as np
@@ -38,10 +39,10 @@ class DateDiffMixin(BaseModel):
38
39
 
39
40
 
40
41
  class DateDiff(PandasOperand, DateDiffMixin):
41
- name: str = "date_diff"
42
- alias: Optional[str] = "date_diff_type1"
43
- is_binary: bool = True
44
- has_symmetry_importance: bool = True
42
+ name = "date_diff"
43
+ alias = "date_diff_type1"
44
+ is_binary = True
45
+ has_symmetry_importance = True
45
46
 
46
47
  replace_negative: bool = False
47
48
 
@@ -70,9 +71,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
70
71
 
71
72
 
72
73
  class DateDiffType2(PandasOperand, DateDiffMixin):
73
- name: str = "date_diff_type2"
74
- is_binary: bool = True
75
- has_symmetry_importance: bool = True
74
+ name = "date_diff_type2"
75
+ is_binary = True
76
+ has_symmetry_importance = True
76
77
 
77
78
  def get_params(self) -> Dict[str, Optional[str]]:
78
79
  res = super().get_params()
@@ -104,8 +105,8 @@ _count_aggregations = ["nunique", "count"]
104
105
 
105
106
 
106
107
  class DateListDiff(PandasOperand, DateDiffMixin):
107
- is_binary: bool = True
108
- has_symmetry_importance: bool = True
108
+ is_binary = True
109
+ has_symmetry_importance = True
109
110
 
110
111
  aggregation: str
111
112
  replace_negative: bool = False
@@ -165,8 +166,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
165
166
 
166
167
 
167
168
  class DateListDiffBounded(DateListDiff):
168
- lower_bound: Optional[int] = None
169
- upper_bound: Optional[int] = None
169
+ lower_bound: Optional[int]
170
+ upper_bound: Optional[int]
170
171
 
171
172
  def __init__(self, **data: Any) -> None:
172
173
  if "name" not in data:
@@ -191,8 +192,8 @@ class DateListDiffBounded(DateListDiff):
191
192
 
192
193
 
193
194
  class DatePercentileBase(PandasOperand, abc.ABC):
194
- is_binary: bool = True
195
- output_type: Optional[str] = "float"
195
+ is_binary = True
196
+ output_type = "float"
196
197
 
197
198
  date_unit: Optional[str] = None
198
199
 
@@ -226,12 +227,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
226
227
 
227
228
 
228
229
  class DatePercentile(DatePercentileBase):
229
- name: str = "date_per"
230
- alias: Optional[str] = "date_per_method1"
230
+ name = "date_per"
231
+ alias = "date_per_method1"
231
232
 
232
- zero_month: Optional[int] = None
233
- zero_year: Optional[int] = None
234
- zero_bounds: Optional[List[float]] = None
233
+ zero_month: Optional[int]
234
+ zero_year: Optional[int]
235
+ zero_bounds: Optional[List[float]]
235
236
  step: int = 30
236
237
 
237
238
  def get_params(self) -> Dict[str, Optional[str]]:
@@ -246,12 +247,12 @@ class DatePercentile(DatePercentileBase):
246
247
  )
247
248
  return res
248
249
 
249
- @validator("zero_bounds", pre="true")
250
+ @validator("zero_bounds", pre=True)
250
251
  def validate_bounds(cls, value):
251
252
  if value is None or isinstance(value, list):
252
253
  return value
253
254
  elif isinstance(value, str):
254
- return value[1:-1].split(", ")
255
+ return json.loads(value)
255
256
 
256
257
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
257
258
  months = date_col.dt.month
@@ -264,7 +265,7 @@ class DatePercentile(DatePercentileBase):
264
265
 
265
266
 
266
267
  class DatePercentileMethod2(DatePercentileBase):
267
- name: str = "date_per_method2"
268
+ name = "date_per_method2"
268
269
 
269
270
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
270
271
  pass
upgini/autofe/groupby.py CHANGED
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
7
 
8
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
9
9
  agg: Optional[str]
10
- is_vectorizable: bool = True
11
- is_grouping: bool = True
12
- is_distribution_dependent: bool = True
10
+ is_vectorizable = True
11
+ is_grouping = True
12
+ is_distribution_dependent = True
13
13
 
14
14
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
15
  temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
24
24
 
25
25
 
26
26
  class GroupByThenMedian(GroupByThenAgg):
27
- name: str = "GroupByThenMedian"
28
- pandas_agg: str = "median"
29
- is_distribution_dependent: bool = True
27
+ name = "GroupByThenMedian"
28
+ pandas_agg = "median"
29
+ is_distribution_dependent = True
30
30
 
31
31
 
32
32
  class GroupByThenRank(PandasOperand, VectorizableMixin):
33
- name: str = "GroupByThenRank"
34
- is_vectorizable: bool = True
35
- is_grouping: bool = True
36
- output_type: Optional[str] = "float"
37
- is_distribution_dependent: bool = True
33
+ name = "GroupByThenRank"
34
+ is_vectorizable = True
35
+ is_grouping = True
36
+ output_type = "float"
37
+ is_distribution_dependent = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
49
49
 
50
50
 
51
51
  class GroupByThenNUnique(PandasOperand, VectorizableMixin):
52
- name: str = "GroupByThenNUnique"
53
- is_vectorizable: bool = True
54
- is_grouping: bool = True
55
- output_type: Optional[str] = "int"
56
- is_distribution_dependent: bool = True
57
- input_type: Optional[str] = "discrete"
52
+ name = "GroupByThenNUnique"
53
+ is_vectorizable = True
54
+ is_grouping = True
55
+ output_type = "int"
56
+ is_distribution_dependent = True
57
+ input_type = "discrete"
58
58
 
59
59
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
60
60
  nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
69
69
 
70
70
 
71
71
  class GroupByThenFreq(PandasOperand):
72
- name: str = "GroupByThenFreq"
73
- is_grouping: bool = True
74
- output_type: Optional[str] = "float"
75
- is_distribution_dependent: bool = True
76
- input_type: Optional[str] = "discrete"
72
+ name = "GroupByThenFreq"
73
+ is_grouping = True
74
+ output_type = "float"
75
+ is_distribution_dependent = True
76
+ input_type = "discrete"
77
77
 
78
78
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
79
79
  def _f(x):
upgini/autofe/operand.py CHANGED
@@ -8,19 +8,19 @@ from pydantic import BaseModel
8
8
 
9
9
  class Operand(BaseModel):
10
10
  name: str
11
- alias: Optional[str] = None
11
+ alias: Optional[str]
12
12
  is_unary: bool = False
13
13
  is_symmetrical: bool = False
14
14
  has_symmetry_importance: bool = False
15
- input_type: Optional[str] = None
16
- output_type: Optional[str] = None
15
+ input_type: Optional[str]
16
+ output_type: Optional[str]
17
17
  is_categorical: bool = False
18
18
  is_vectorizable: bool = False
19
19
  is_grouping: bool = False
20
20
  is_binary: bool = False
21
21
  is_vector: bool = False
22
22
  is_distribution_dependent: bool = False
23
- params: Optional[Dict[str, str]] = None
23
+ params: Optional[Dict[str, str]]
24
24
 
25
25
  def set_params(self, params: Dict[str, str]):
26
26
  self.params = params
upgini/autofe/unary.py CHANGED
@@ -1,4 +1,3 @@
1
- from typing import Optional
2
1
  import numpy as np
3
2
  import pandas as pd
4
3
  from sklearn.preprocessing import Normalizer
@@ -7,10 +6,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
6
 
8
7
 
9
8
  class Abs(PandasOperand, VectorizableMixin):
10
- name: str = "abs"
11
- is_unary: bool = True
12
- is_vectorizable: bool = True
13
- group_index: int = 0
9
+ name = "abs"
10
+ is_unary = True
11
+ is_vectorizable = True
12
+ group_index = 0
14
13
 
15
14
  def calculate_unary(self, data: pd.Series) -> pd.Series:
16
15
  return data.abs()
@@ -20,11 +19,11 @@ class Abs(PandasOperand, VectorizableMixin):
20
19
 
21
20
 
22
21
  class Log(PandasOperand, VectorizableMixin):
23
- name: str = "log"
24
- is_unary: bool = True
25
- is_vectorizable: bool = True
26
- output_type: Optional[str] = "float"
27
- group_index: int = 0
22
+ name = "log"
23
+ is_unary = True
24
+ is_vectorizable = True
25
+ output_type = "float"
26
+ group_index = 0
28
27
 
29
28
  def calculate_unary(self, data: pd.Series) -> pd.Series:
30
29
  return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -34,11 +33,11 @@ class Log(PandasOperand, VectorizableMixin):
34
33
 
35
34
 
36
35
  class Sqrt(PandasOperand, VectorizableMixin):
37
- name: str = "sqrt"
38
- is_unary: bool = True
39
- is_vectorizable: bool = True
40
- output_type: Optional[str] = "float"
41
- group_index: int = 0
36
+ name = "sqrt"
37
+ is_unary = True
38
+ is_vectorizable = True
39
+ output_type = "float"
40
+ group_index = 0
42
41
 
43
42
  def calculate_unary(self, data: pd.Series) -> pd.Series:
44
43
  return self._round_value(np.sqrt(np.abs(data)))
@@ -48,10 +47,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
48
47
 
49
48
 
50
49
  class Square(PandasOperand, VectorizableMixin):
51
- name: str = "square"
52
- is_unary: bool = True
53
- is_vectorizable: bool = True
54
- group_index: int = 0
50
+ name = "square"
51
+ is_unary = True
52
+ is_vectorizable = True
53
+ group_index = 0
55
54
 
56
55
  def calculate_unary(self, data: pd.Series) -> pd.Series:
57
56
  return np.square(data)
@@ -61,11 +60,11 @@ class Square(PandasOperand, VectorizableMixin):
61
60
 
62
61
 
63
62
  class Sigmoid(PandasOperand, VectorizableMixin):
64
- name: str = "sigmoid"
65
- is_unary: bool = True
66
- is_vectorizable: bool = True
67
- output_type: Optional[str] = "float"
68
- group_index: int = 0
63
+ name = "sigmoid"
64
+ is_unary = True
65
+ is_vectorizable = True
66
+ output_type = "float"
67
+ group_index = 0
69
68
 
70
69
  def calculate_unary(self, data: pd.Series) -> pd.Series:
71
70
  return self._round_value(1 / (1 + np.exp(-data)))
@@ -75,12 +74,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
75
74
 
76
75
 
77
76
  class Floor(PandasOperand, VectorizableMixin):
78
- name: str = "floor"
79
- is_unary: bool = True
80
- is_vectorizable: bool = True
81
- output_type: Optional[str] = "int"
82
- input_type: Optional[str] = "continuous"
83
- group_index: int = 0
77
+ name = "floor"
78
+ is_unary = True
79
+ is_vectorizable = True
80
+ output_type = "int"
81
+ input_type = "continuous"
82
+ group_index = 0
84
83
 
85
84
  def calculate_unary(self, data: pd.Series) -> pd.Series:
86
85
  return np.floor(data)
@@ -90,11 +89,11 @@ class Floor(PandasOperand, VectorizableMixin):
90
89
 
91
90
 
92
91
  class Residual(PandasOperand, VectorizableMixin):
93
- name: str = "residual"
94
- is_unary: bool = True
95
- is_vectorizable: bool = True
96
- input_type: Optional[str] = "continuous"
97
- group_index: int = 0
92
+ name = "residual"
93
+ is_unary = True
94
+ is_vectorizable = True
95
+ input_type = "continuous"
96
+ group_index = 0
98
97
 
99
98
  def calculate_unary(self, data: pd.Series) -> pd.Series:
100
99
  return data - np.floor(data)
@@ -104,11 +103,11 @@ class Residual(PandasOperand, VectorizableMixin):
104
103
 
105
104
 
106
105
  class Freq(PandasOperand):
107
- name: str = "freq"
108
- is_unary: bool = True
109
- output_type: Optional[str] = "float"
110
- is_distribution_dependent: bool = True
111
- input_type: Optional[str] = "discrete"
106
+ name = "freq"
107
+ is_unary = True
108
+ output_type = "float"
109
+ is_distribution_dependent = True
110
+ input_type = "discrete"
112
111
 
113
112
  def calculate_unary(self, data: pd.Series) -> pd.Series:
114
113
  value_counts = data.value_counts(normalize=True)
@@ -116,9 +115,9 @@ class Freq(PandasOperand):
116
115
 
117
116
 
118
117
  class Norm(PandasOperand):
119
- name: str = "norm"
120
- is_unary: bool = True
121
- output_type: Optional[str] = "float"
118
+ name = "norm"
119
+ is_unary = True
120
+ output_type = "float"
122
121
 
123
122
  def calculate_unary(self, data: pd.Series) -> pd.Series:
124
123
  data_dropna = data.dropna()
@@ -132,7 +131,7 @@ class Norm(PandasOperand):
132
131
 
133
132
 
134
133
  class Embeddings(PandasOperand):
135
- name: str = "emb"
136
- is_unary: bool = True
137
- input_type: Optional[str] = "string"
138
- output_type: Optional[str] = "vector"
134
+ name = "emb"
135
+ is_unary = True
136
+ input_type = "string"
137
+ output_type = "vector"
upgini/autofe/vector.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Mean(PandasOperand, VectorizableMixin):
9
- name: str = "mean"
10
- output_type: Optional[str] = "float"
11
- is_vector: bool = True
12
- group_index: int = 0
9
+ name = "mean"
10
+ output_type = "float"
11
+ is_vector = True
12
+ group_index = 0
13
13
 
14
14
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
15
15
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
16
16
 
17
17
 
18
18
  class Sum(PandasOperand, VectorizableMixin):
19
- name: str = "sum"
20
- is_vector: bool = True
21
- group_index: int = 0
19
+ name = "sum"
20
+ is_vector = True
21
+ group_index = 0
22
22
 
23
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
@@ -23,6 +23,7 @@ from pandas.api.types import (
23
23
  is_datetime64_any_dtype,
24
24
  is_numeric_dtype,
25
25
  is_object_dtype,
26
+ is_period_dtype,
26
27
  is_string_dtype,
27
28
  )
28
29
  from scipy.stats import ks_2samp
@@ -1407,9 +1408,7 @@ class FeaturesEnricher(TransformerMixin):
1407
1408
  # TODO maybe there is no more need for these convertions
1408
1409
  # Remove datetime features
1409
1410
  datetime_features = [
1410
- f
1411
- for f in fitting_X.columns
1412
- if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
1411
+ f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
1413
1412
  ]
1414
1413
  if len(datetime_features) > 0:
1415
1414
  self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
upgini/http.py CHANGED
@@ -39,6 +39,18 @@ from upgini.metadata import (
39
39
  from upgini.resource_bundle import bundle
40
40
  from upgini.utils.track_info import get_track_metrics
41
41
 
42
+ # try:
43
+ # from importlib.metadata import version # type: ignore
44
+
45
+ # __version__ = version("upgini")
46
+ # except ImportError:
47
+ # try:
48
+ # from importlib_metadata import version # type: ignore
49
+
50
+ # __version__ = version("upgini")
51
+ # except ImportError:
52
+ # __version__ = "Upgini wasn't installed"
53
+
42
54
  UPGINI_URL: str = "UPGINI_URL"
43
55
  UPGINI_API_KEY: str = "UPGINI_API_KEY"
44
56
  DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
@@ -440,18 +452,18 @@ class _RestClient:
440
452
  content = file.read()
441
453
  md5_hash.update(content)
442
454
  digest = md5_hash.hexdigest()
443
- metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
455
+ metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
444
456
 
445
457
  digest_sha256 = hashlib.sha256(
446
458
  pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
447
459
  ).hexdigest()
448
- metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
460
+ metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
449
461
 
450
462
  with open(file_path, "rb") as file:
451
463
  files = {
452
464
  "metadata": (
453
465
  "metadata.json",
454
- metadata_with_md5.model_dump_json(exclude_none=True).encode(),
466
+ metadata_with_md5.json(exclude_none=True).encode(),
455
467
  "application/json",
456
468
  ),
457
469
  "tracking": (
@@ -459,17 +471,13 @@ class _RestClient:
459
471
  dumps(track_metrics).encode(),
460
472
  "application/json",
461
473
  ),
462
- "metrics": (
463
- "metrics.json",
464
- metrics.model_dump_json(exclude_none=True).encode(),
465
- "application/json",
466
- ),
474
+ "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
467
475
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
468
476
  }
469
477
  if search_customization is not None:
470
478
  files["customization"] = (
471
479
  "customization.json",
472
- search_customization.model_dump_json(exclude_none=True).encode(),
480
+ search_customization.json(exclude_none=True).encode(),
473
481
  "application/json",
474
482
  )
475
483
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
@@ -484,7 +492,7 @@ class _RestClient:
484
492
  def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
485
493
  api_path = self.CHECK_UPLOADED_FILE_URL_FMT_V2.format(file_upload_id)
486
494
  response = self._with_unauth_retry(
487
- lambda: self._send_post_req(api_path, trace_id, metadata.model_dump_json(exclude_none=True))
495
+ lambda: self._send_post_req(api_path, trace_id, metadata.json(exclude_none=True))
488
496
  )
489
497
  return bool(response)
490
498
 
@@ -498,11 +506,11 @@ class _RestClient:
498
506
  ) -> SearchTaskResponse:
499
507
  api_path = self.INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id)
500
508
  files = {
501
- "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
502
- "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
509
+ "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
510
+ "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
503
511
  }
504
512
  if search_customization is not None:
505
- files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
513
+ files["customization"] = search_customization.json(exclude_none=True).encode()
506
514
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
507
515
  response = self._with_unauth_retry(
508
516
  lambda: self._send_post_file_req_v2(
@@ -528,18 +536,18 @@ class _RestClient:
528
536
  content = file.read()
529
537
  md5_hash.update(content)
530
538
  digest = md5_hash.hexdigest()
531
- metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
539
+ metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
532
540
 
533
541
  digest_sha256 = hashlib.sha256(
534
542
  pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
535
543
  ).hexdigest()
536
- metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
544
+ metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
537
545
 
538
546
  with open(file_path, "rb") as file:
539
547
  files = {
540
548
  "metadata": (
541
549
  "metadata.json",
542
- metadata_with_md5.model_dump_json(exclude_none=True).encode(),
550
+ metadata_with_md5.json(exclude_none=True).encode(),
543
551
  "application/json",
544
552
  ),
545
553
  "tracking": (
@@ -547,17 +555,13 @@ class _RestClient:
547
555
  dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
548
556
  "application/json",
549
557
  ),
550
- "metrics": (
551
- "metrics.json",
552
- metrics.model_dump_json(exclude_none=True).encode(),
553
- "application/json",
554
- ),
558
+ "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
555
559
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
556
560
  }
557
561
  if search_customization is not None:
558
562
  files["customization"] = (
559
563
  "customization.json",
560
- search_customization.model_dump_json(exclude_none=True).encode(),
564
+ search_customization.json(exclude_none=True).encode(),
561
565
  "application/json",
562
566
  )
563
567
 
@@ -581,11 +585,11 @@ class _RestClient:
581
585
  ) -> SearchTaskResponse:
582
586
  api_path = self.VALIDATION_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id, initial_search_task_id)
583
587
  files = {
584
- "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
585
- "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
588
+ "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
589
+ "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
586
590
  }
587
591
  if search_customization is not None:
588
- files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
592
+ files["customization"] = search_customization.json(exclude_none=True).encode()
589
593
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
590
594
  response = self._with_unauth_retry(
591
595
  lambda: self._send_post_file_req_v2(
@@ -647,11 +651,7 @@ class _RestClient:
647
651
  with open(file_path, "rb") as file:
648
652
  files = {
649
653
  "file": (metadata.name, file, "application/octet-stream"),
650
- "metadata": (
651
- "metadata.json",
652
- metadata.model_dump_json(exclude_none=True).encode(),
653
- "application/json",
654
- ),
654
+ "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
655
655
  }
656
656
 
657
657
  return self._send_post_file_req_v2(api_path, files)
@@ -661,12 +661,12 @@ class _RestClient:
661
661
  def get_search_file_metadata(self, search_task_id: str, trace_id: str) -> FileMetadata:
662
662
  api_path = self.SEARCH_FILE_METADATA_URI_FMT_V2.format(search_task_id)
663
663
  response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
664
- return FileMetadata.model_validate(response)
664
+ return FileMetadata.parse_obj(response)
665
665
 
666
666
  def get_provider_search_metadata_v3(self, provider_search_task_id: str, trace_id: str) -> ProviderTaskMetadataV2:
667
667
  api_path = self.SEARCH_TASK_METADATA_FMT_V3.format(provider_search_task_id)
668
668
  response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
669
- return ProviderTaskMetadataV2.model_validate(response)
669
+ return ProviderTaskMetadataV2.parse_obj(response)
670
670
 
671
671
  def get_current_transform_usage(self, trace_id) -> TransformUsage:
672
672
  track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
upgini/lazy_import.py CHANGED
@@ -1,6 +1,4 @@
1
1
  import importlib
2
- import importlib.util
3
- import importlib.machinery
4
2
 
5
3
 
6
4
  class LazyImport:
@@ -12,18 +10,7 @@ class LazyImport:
12
10
 
13
11
  def _load(self):
14
12
  if self._module is None:
15
- # Load module and save link to it
16
- spec = importlib.util.find_spec(self.module_name)
17
- if spec is None:
18
- raise ImportError(f"Module {self.module_name} not found")
19
-
20
- # Create module
21
- self._module = importlib.util.module_from_spec(spec)
22
-
23
- # Execute module
24
- spec.loader.exec_module(self._module)
25
-
26
- # Get class from module
13
+ self._module = importlib.import_module(self.module_name)
27
14
  self._class = getattr(self._module, self.class_name)
28
15
 
29
16
  def __call__(self, *args, **kwargs):
upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Set, Union
4
+ from typing import Dict, List, Optional, Set, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -172,23 +172,23 @@ class FileMetricsInterval(BaseModel):
172
172
  date_cut: float
173
173
  count: float
174
174
  valid_count: float
175
- avg_target: Optional[float] = None # not for multiclass
176
- avg_score_etalon: Optional[float] = None
175
+ avg_target: Optional[float] # not for multiclass
176
+ avg_score_etalon: Optional[float]
177
177
 
178
178
 
179
179
  class FileMetrics(BaseModel):
180
180
  # etalon metadata
181
- task_type: Optional[ModelTaskType] = None
182
- label: Optional[ModelLabelType] = None
183
- count: Optional[int] = None
184
- valid_count: Optional[int] = None
185
- valid_rate: Optional[float] = None
186
- avg_target: Optional[float] = None
187
- metrics_binary_etalon: Optional[BinaryTask] = None
188
- metrics_regression_etalon: Optional[RegressionTask] = None
189
- metrics_multiclass_etalon: Optional[MulticlassTask] = None
190
- cuts: Optional[List[float]] = None
191
- interval: Optional[List[FileMetricsInterval]] = None
181
+ task_type: Optional[ModelTaskType]
182
+ label: Optional[ModelLabelType]
183
+ count: Optional[int]
184
+ valid_count: Optional[int]
185
+ valid_rate: Optional[float]
186
+ avg_target: Optional[float]
187
+ metrics_binary_etalon: Optional[BinaryTask]
188
+ metrics_regression_etalon: Optional[RegressionTask]
189
+ metrics_multiclass_etalon: Optional[MulticlassTask]
190
+ cuts: Optional[List[float]]
191
+ interval: Optional[List[FileMetricsInterval]]
192
192
 
193
193
 
194
194
  class NumericInterval(BaseModel):
@@ -202,25 +202,25 @@ class FileColumnMetadata(BaseModel):
202
202
  dataType: DataType
203
203
  meaningType: FileColumnMeaningType
204
204
  minMaxValues: Optional[NumericInterval] = None
205
- originalName: Optional[str] = None
205
+ originalName: Optional[str]
206
206
  # is this column contains keys from multiple key columns like msisdn1, msisdn2
207
207
  isUnnest: bool = False
208
208
  # list of original etalon key column names like msisdn1, msisdn2
209
- unnestKeyNames: Optional[List[str]] = None
209
+ unnestKeyNames: Optional[List[str]]
210
210
 
211
211
 
212
212
  class FileMetadata(BaseModel):
213
213
  name: str
214
- description: Optional[str] = None
214
+ description: Optional[str]
215
215
  columns: List[FileColumnMetadata]
216
216
  searchKeys: List[List[str]]
217
- excludeFeaturesSources: Optional[List[str]] = None
218
- hierarchicalGroupKeys: Optional[List[str]] = None
219
- hierarchicalSubgroupKeys: Optional[List[str]] = None
220
- taskType: Optional[ModelTaskType] = None
221
- rowsCount: Optional[int] = None
222
- checksumMD5: Optional[str] = None
223
- digest: Optional[str] = None
217
+ excludeFeaturesSources: Optional[List[str]]
218
+ hierarchicalGroupKeys: Optional[List[str]]
219
+ hierarchicalSubgroupKeys: Optional[List[str]]
220
+ taskType: Optional[ModelTaskType]
221
+ rowsCount: Optional[int]
222
+ checksumMD5: Optional[str]
223
+ digest: Optional[str]
224
224
 
225
225
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
226
226
  for c in self.columns:
@@ -244,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
244
244
  source: str
245
245
  hit_rate: float
246
246
  shap_value: float
247
- commercial_schema: Optional[str] = None
248
- data_provider: Optional[str] = None
249
- data_providers: Optional[List[str]] = None
250
- data_provider_link: Optional[str] = None
251
- data_provider_links: Optional[List[str]] = None
252
- data_source: Optional[str] = None
253
- data_sources: Optional[List[str]] = None
254
- data_source_link: Optional[str] = None
255
- data_source_links: Optional[List[str]] = None
256
- doc_link: Optional[str] = None
257
- update_frequency: Optional[str] = None
247
+ commercial_schema: Optional[str]
248
+ data_provider: Optional[str]
249
+ data_providers: Optional[List[str]]
250
+ data_provider_link: Optional[str]
251
+ data_provider_links: Optional[List[str]]
252
+ data_source: Optional[str]
253
+ data_sources: Optional[List[str]]
254
+ data_source_link: Optional[str]
255
+ data_source_links: Optional[List[str]]
256
+ doc_link: Optional[str]
257
+ update_frequency: Optional[str]
258
258
 
259
259
 
260
260
  class HitRateMetrics(BaseModel):
@@ -274,48 +274,48 @@ class ModelEvalSet(BaseModel):
274
274
  class BaseColumnMetadata(BaseModel):
275
275
  original_name: str
276
276
  hashed_name: str
277
- ads_definition_id: Optional[str] = None
277
+ ads_definition_id: Optional[str]
278
278
  is_augmented: bool
279
279
 
280
280
 
281
281
  class GeneratedFeatureMetadata(BaseModel):
282
- alias: Optional[str] = None
282
+ alias: Optional[str]
283
283
  formula: str
284
284
  display_index: str
285
285
  base_columns: List[BaseColumnMetadata]
286
- operator_params: Optional[Dict[str, str]] = None
286
+ operator_params: Optional[Dict[str, str]]
287
287
 
288
288
 
289
289
  class ProviderTaskMetadataV2(BaseModel):
290
290
  features: List[FeaturesMetadataV2]
291
- hit_rate_metrics: Optional[HitRateMetrics] = None
292
- eval_set_metrics: Optional[List[ModelEvalSet]] = None
293
- zero_hit_rate_search_keys: Optional[List[str]] = None
294
- features_used_for_embeddings: Optional[List[str]] = None
295
- shuffle_kfold: Optional[bool] = None
296
- generated_features: Optional[List[GeneratedFeatureMetadata]] = None
291
+ hit_rate_metrics: Optional[HitRateMetrics]
292
+ eval_set_metrics: Optional[List[ModelEvalSet]]
293
+ zero_hit_rate_search_keys: Optional[List[str]]
294
+ features_used_for_embeddings: Optional[List[str]]
295
+ shuffle_kfold: Optional[bool]
296
+ generated_features: Optional[List[GeneratedFeatureMetadata]]
297
297
 
298
298
 
299
299
  class FeaturesFilter(BaseModel):
300
- minImportance: Optional[float] = None
301
- maxPSI: Optional[float] = None
302
- maxCount: Optional[int] = None
303
- selectedFeatures: Optional[List[str]] = None
300
+ minImportance: Optional[float]
301
+ maxPSI: Optional[float]
302
+ maxCount: Optional[int]
303
+ selectedFeatures: Optional[List[str]]
304
304
 
305
305
 
306
306
  class RuntimeParameters(BaseModel):
307
- properties: Dict[str, Any] = {}
307
+ properties: Dict[str, str] = {}
308
308
 
309
309
 
310
310
  class SearchCustomization(BaseModel):
311
- featuresFilter: Optional[FeaturesFilter] = None
312
- extractFeatures: Optional[bool] = None
313
- accurateModel: Optional[bool] = None
314
- importanceThreshold: Optional[float] = None
315
- maxFeatures: Optional[int] = None
316
- returnScores: Optional[bool] = None
317
- runtimeParameters: Optional[RuntimeParameters] = None
318
- metricsCalculation: Optional[bool] = None
311
+ featuresFilter: Optional[FeaturesFilter]
312
+ extractFeatures: Optional[bool]
313
+ accurateModel: Optional[bool]
314
+ importanceThreshold: Optional[float]
315
+ maxFeatures: Optional[int]
316
+ returnScores: Optional[bool]
317
+ runtimeParameters: Optional[RuntimeParameters]
318
+ metricsCalculation: Optional[bool]
319
319
 
320
320
  def __repr__(self):
321
321
  return (
@@ -10,6 +10,7 @@ from pandas.api.types import (
10
10
  is_float_dtype,
11
11
  is_numeric_dtype,
12
12
  is_object_dtype,
13
+ is_period_dtype,
13
14
  is_string_dtype,
14
15
  )
15
16
 
@@ -134,7 +135,7 @@ class Normalizer:
134
135
 
135
136
  removed_features = []
136
137
  for f in features:
137
- if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
138
+ if is_datetime(df[f]) or is_period_dtype(df[f]):
138
139
  removed_features.append(f)
139
140
  df.drop(columns=f, inplace=True)
140
141
 
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import is_numeric_dtype
9
+ from pandas.api.types import is_numeric_dtype, is_period_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
12
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
@@ -84,7 +84,7 @@ class DateTimeSearchKeyConverter:
84
84
  df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
85
85
  elif isinstance(df[self.date_column].values[0], datetime.date):
86
86
  df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
87
- elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
87
+ elif is_period_dtype(df[self.date_column]):
88
88
  df[self.date_column] = df[self.date_column].dt.to_timestamp()
89
89
  elif is_numeric_dtype(df[self.date_column]):
90
90
  # 315532801 - 2524608001 - seconds
@@ -207,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
207
207
  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
208
208
  df = df.copy()
209
209
  seconds = "datetime_seconds"
210
- if isinstance(df[date_col].dtype, pd.PeriodDtype):
210
+ if is_period_dtype(df[date_col]):
211
211
  df[date_col] = df[date_col].dt.to_timestamp()
212
212
  else:
213
213
  df[date_col] = pd.to_datetime(df[date_col])
@@ -275,7 +275,7 @@ def validate_dates_distribution(
275
275
  if col in search_keys:
276
276
  continue
277
277
  try:
278
- if isinstance(X[col].dtype, pd.PeriodDtype):
278
+ if is_period_dtype(X[col]):
279
279
  pass
280
280
  elif pd.__version__ >= "2.0.0":
281
281
  # Format mixed to avoid massive warnings
@@ -290,7 +290,7 @@ def validate_dates_distribution(
290
290
  if maybe_date_col is None:
291
291
  return
292
292
 
293
- if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
293
+ if is_period_dtype(X[maybe_date_col]):
294
294
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
295
295
  elif pd.__version__ >= "2.0.0":
296
296
  dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
@@ -1,8 +1,12 @@
1
1
  from typing import Optional
2
2
 
3
- import numpy as np
4
3
  import pandas as pd
5
- from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
4
+ from pandas.api.types import (
5
+ is_float_dtype,
6
+ is_int64_dtype,
7
+ is_object_dtype,
8
+ is_string_dtype,
9
+ )
6
10
 
7
11
  from upgini.errors import ValidationError
8
12
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
@@ -59,9 +63,7 @@ class PhoneSearchKeyConverter:
59
63
  convert_func = self.phone_str_to_int_safe
60
64
  elif is_float_dtype(df[self.phone_column]):
61
65
  convert_func = self.phone_float_to_int_safe
62
- elif df[self.phone_column].dtype == np.int64 or isinstance(
63
- df[self.phone_column].dtype, pd.Int64Dtype
64
- ):
66
+ elif is_int64_dtype(df[self.phone_column]):
65
67
  convert_func = self.phone_int_to_int_safe
66
68
  else:
67
69
  raise ValidationError(
@@ -194,7 +194,4 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
194
194
  test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
195
195
 
196
196
  # Calculate the PSI
197
- try:
198
- return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
199
- except Exception:
200
- return np.nan
197
+ return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.316a4
3
+ Version: 1.1.317
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
29
29
  Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: lightgbm>=3.3.2
32
- Requires-Dist: numpy<=1.26.4,>=1.19.0
32
+ Requires-Dist: numpy>=1.19.0
33
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
34
- Requires-Dist: pydantic<3.0.0,>1.0.0
34
+ Requires-Dist: pydantic<2.0.0,>=1.8.2
35
35
  Requires-Dist: pyjwt>=2.8.0
36
36
  Requires-Dist: python-bidi==0.4.2
37
37
  Requires-Dist: python-dateutil>=2.8.0
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=i-OBw28RkpSvTIOZTSuyNEDzto-65EgHHD5fh416j-8,26
1
+ upgini/__about__.py,sha256=7A4Mpkf8cSUSzwIJzMaQ6hlkjN2sldlyOHl5dtLNJkE,24
2
2
  upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=_d8ya5RRoYN0o6mV6gda-bLdOngQ4rb1SA51SlM_TG0,188002
7
- upgini/http.py,sha256=gCN5ru_I6JNHk-m6-Ckjhd23iMzOAzDSLb0tSEcxkC4,43068
8
- upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
- upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
6
+ upgini/features_enricher.py,sha256=Gu4gsnMVjcsfWnJlu4Np3jpE9Au1UywhuHQb0Xv5YNg,187982
7
+ upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
8
+ upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
+ upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
10
10
  upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
11
11
  upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -15,19 +15,19 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
- upgini/autofe/binary.py,sha256=xRBT7RNqQ7pprz6cRpO1KnvZCb7PvU3QXBfaP6Omqi4,7425
19
- upgini/autofe/date.py,sha256=aKuEsguYSrFdFiLd6tBLVH4TiQ3JFMo_49_Ajp8eKQg,9208
18
+ upgini/autofe/binary.py,sha256=2Z5FrfdCtesKEHBuabEBiRvwOAzcRoFKAX1wvGpHL0I,7003
19
+ upgini/autofe/date.py,sha256=ijB9RCh5wBwl03Nl8zDYA50gpL4sqmAkYVYzVPm1bn0,9070
20
20
  upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
21
- upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
- upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
23
- upgini/autofe/unary.py,sha256=RiK-Fz3fgjPlqWWfro6x7qChjEZ8W8RTnl5-MT1kaQA,4218
24
- upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
21
+ upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
+ upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
+ upgini/autofe/unary.py,sha256=oIMf-IVy7L7GkzxMmQyExX0tOH9RhWeQh7cGxxMDiPk,3832
24
+ upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
30
+ upgini/normalizer/normalize_utils.py,sha256=8gH1oabPNZrC1kHSRFxGGcO0o6yNDlOJXCLzzExq-3s,7451
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
33
  upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
42
42
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
- upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
45
+ upgini/utils/datetime_utils.py,sha256=niZcf2YqAwokUFUW474zajlzv9HAMf7nv9v_WPJHpyc,12123
46
46
  upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
53
- upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
53
+ upgini/utils/phone_utils.py,sha256=PTSRfGAWCuLy8R6I8X6clcc1K7bZXIIrZ_alIB8irC8,10368
54
54
  upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
57
+ upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.316a4.dist-info/METADATA,sha256=fz3VauXEbpF-_XYIVwL3rESOeDF3kV8HSluaOUBN1hE,48232
61
- upgini-1.1.316a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.316a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.316a4.dist-info/RECORD,,
60
+ upgini-1.1.317.dist-info/METADATA,sha256=MAx5zlya3JBerLBEmC9me552zgexw4gy4Cfc2VuNzSg,48222
61
+ upgini-1.1.317.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.317.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.317.dist-info/RECORD,,