upgini 1.1.316__py3-none-any.whl → 1.1.316a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only, and it reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the package registry's advisory page for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.316"
1
+ __version__ = "1.1.316a2"
upgini/autofe/binary.py CHANGED
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
12
- name = "min"
13
- is_binary = True
14
- is_symmetrical = True
15
- has_symmetry_importance = True
12
+ name: str = "min"
13
+ is_binary: bool = True
14
+ is_symmetrical: bool = True
15
+ has_symmetry_importance: bool = True
16
16
 
17
17
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
21
  class Max(PandasOperand):
22
- name = "max"
23
- is_binary = True
24
- is_symmetrical = True
25
- has_symmetry_importance = True
22
+ name: str = "max"
23
+ is_binary: bool = True
24
+ is_symmetrical: bool = True
25
+ has_symmetry_importance: bool = True
26
26
 
27
27
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
31
  class Add(PandasOperand, VectorizableMixin):
32
- name = "+"
33
- alias = "add"
34
- is_binary = True
35
- is_symmetrical = True
36
- has_symmetry_importance = True
37
- is_vectorizable = True
32
+ name: str = "+"
33
+ alias: str = "add"
34
+ is_binary: bool = True
35
+ is_symmetrical: bool = True
36
+ has_symmetry_importance: bool = True
37
+ is_vectorizable: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
48
48
 
49
49
 
50
50
  class Subtract(PandasOperand, VectorizableMixin):
51
- name = "-"
52
- alias = "sub"
53
- is_binary = True
54
- is_symmetrical = True
55
- has_symmetry_importance = True
56
- is_vectorizable = True
51
+ name: str = "-"
52
+ alias: str = "sub"
53
+ is_binary: bool = True
54
+ is_symmetrical: bool = True
55
+ has_symmetry_importance: bool = True
56
+ is_vectorizable: bool = True
57
57
 
58
58
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
59
59
  return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
67
67
 
68
68
 
69
69
  class Multiply(PandasOperand, VectorizableMixin):
70
- name = "*"
71
- alias = "mul"
72
- is_binary = True
73
- is_symmetrical = True
74
- has_symmetry_importance = True
75
- is_vectorizable = True
70
+ name: str = "*"
71
+ alias: str = "mul"
72
+ is_binary: bool = True
73
+ is_symmetrical: bool = True
74
+ has_symmetry_importance: bool = True
75
+ is_vectorizable: bool = True
76
76
 
77
77
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
78
78
  return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
86
86
 
87
87
 
88
88
  class Divide(PandasOperand, VectorizableMixin):
89
- name = "/"
90
- alias = "div"
91
- is_binary = True
92
- has_symmetry_importance = True
93
- is_vectorizable = True
94
- output_type = "float"
89
+ name: str = "/"
90
+ alias: str = "div"
91
+ is_binary: bool = True
92
+ has_symmetry_importance: bool = True
93
+ is_vectorizable: bool = True
94
+ output_type: Optional[str] = "float"
95
95
 
96
96
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
97
97
  return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
105
105
 
106
106
 
107
107
  class Combine(PandasOperand):
108
- name = "Combine"
109
- is_binary = True
110
- has_symmetry_importance = True
111
- output_type = "object"
108
+ name: str = "Combine"
109
+ is_binary: bool = True
110
+ has_symmetry_importance: bool = True
111
+ output_type: Optional[str] = "object"
112
112
 
113
113
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
114
114
  temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
117
117
 
118
118
 
119
119
  class CombineThenFreq(PandasOperand):
120
- name = "CombineThenFreq"
121
- is_binary = True
122
- is_symmetrical = True
123
- has_symmetry_importance = True
124
- output_type = "float"
125
- is_distribution_dependent = True
126
- input_type = "discrete"
120
+ name: str = "CombineThenFreq"
121
+ is_binary: bool = True
122
+ is_symmetrical: bool = True
123
+ has_symmetry_importance: bool = True
124
+ output_type: Optional[str] = "float"
125
+ is_distribution_dependent: bool = True
126
+ input_type: Optional[str] = "discrete"
127
127
 
128
128
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
129
129
  temp = left.astype(str) + "_" + right.astype(str)
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
133
133
 
134
134
 
135
135
  class Distance(PandasOperand):
136
- name = "dist"
137
- is_binary = True
138
- output_type = "float"
139
- is_symmetrical = True
140
- has_symmetry_importance = True
136
+ name: str = "dist"
137
+ is_binary: bool = True
138
+ output_type: Optional[str] = "float"
139
+ is_symmetrical: bool = True
140
+ has_symmetry_importance: bool = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
144
- 1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
144
+ 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
145
145
  )
146
146
 
147
147
  # row-wise dot product
@@ -152,17 +152,14 @@ class Distance(PandasOperand):
152
152
  res = res.reindex(left.index.union(right.index))
153
153
  return res
154
154
 
155
- def __norm(self, vector: pd.Series) -> pd.Series:
156
- return np.sqrt(self.__dot(vector, vector))
157
-
158
155
 
159
156
  # Left for backward compatibility
160
157
  class Sim(Distance):
161
- name = "sim"
162
- is_binary = True
163
- output_type = "float"
164
- is_symmetrical = True
165
- has_symmetry_importance = True
158
+ name: str = "sim"
159
+ is_binary: bool = True
160
+ output_type: Optional[str] = "float"
161
+ is_symmetrical: bool = True
162
+ has_symmetry_importance: bool = True
166
163
 
167
164
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
165
  return 1 - super().calculate_binary(left, right)
@@ -191,12 +188,12 @@ class StringSim(PandasOperand, abc.ABC):
191
188
 
192
189
 
193
190
  class JaroWinklerSim1(StringSim):
194
- name = "sim_jw1"
195
- is_binary = True
196
- input_type = "string"
197
- output_type = "float"
198
- is_symmetrical = True
199
- has_symmetry_importance = True
191
+ name: str = "sim_jw1"
192
+ is_binary: bool = True
193
+ input_type: Optional[str] = "string"
194
+ output_type: Optional[str] = "float"
195
+ is_symmetrical: bool = True
196
+ has_symmetry_importance: bool = True
200
197
 
201
198
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
202
199
  return value
@@ -206,12 +203,12 @@ class JaroWinklerSim1(StringSim):
206
203
 
207
204
 
208
205
  class JaroWinklerSim2(StringSim):
209
- name = "sim_jw2"
210
- is_binary = True
211
- input_type = "string"
212
- output_type = "float"
213
- is_symmetrical = True
214
- has_symmetry_importance = True
206
+ name: str = "sim_jw2"
207
+ is_binary: bool = True
208
+ input_type: Optional[str] = "string"
209
+ output_type: Optional[str] = "float"
210
+ is_symmetrical: bool = True
211
+ has_symmetry_importance: bool = True
215
212
 
216
213
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
217
214
  return value[::-1] if value is not None else None
@@ -221,12 +218,12 @@ class JaroWinklerSim2(StringSim):
221
218
 
222
219
 
223
220
  class LevenshteinSim(StringSim):
224
- name = "sim_lv"
225
- is_binary = True
226
- input_type = "string"
227
- output_type = "float"
228
- is_symmetrical = True
229
- has_symmetry_importance = True
221
+ name: str = "sim_lv"
222
+ is_binary: bool = True
223
+ input_type: Optional[str] = "string"
224
+ output_type: Optional[str] = "float"
225
+ is_symmetrical: bool = True
226
+ has_symmetry_importance: bool = True
230
227
 
231
228
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
232
229
  return value
upgini/autofe/date.py CHANGED
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union
4
4
  import numpy as np
5
5
  import pandas as pd
6
6
  from pandas.core.arrays.timedeltas import TimedeltaArray
7
- from pydantic import BaseModel, validator
7
+ from pydantic import BaseModel, field_validator
8
8
 
9
9
  from upgini.autofe.operand import PandasOperand
10
10
 
@@ -38,10 +38,10 @@ class DateDiffMixin(BaseModel):
38
38
 
39
39
 
40
40
  class DateDiff(PandasOperand, DateDiffMixin):
41
- name = "date_diff"
42
- alias = "date_diff_type1"
43
- is_binary = True
44
- has_symmetry_importance = True
41
+ name: str = "date_diff"
42
+ alias: Optional[str] = "date_diff_type1"
43
+ is_binary: bool = True
44
+ has_symmetry_importance: bool = True
45
45
 
46
46
  replace_negative: bool = False
47
47
 
@@ -70,9 +70,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
70
70
 
71
71
 
72
72
  class DateDiffType2(PandasOperand, DateDiffMixin):
73
- name = "date_diff_type2"
74
- is_binary = True
75
- has_symmetry_importance = True
73
+ name: str = "date_diff_type2"
74
+ is_binary: bool = True
75
+ has_symmetry_importance: bool = True
76
76
 
77
77
  def get_params(self) -> Dict[str, Optional[str]]:
78
78
  res = super().get_params()
@@ -104,8 +104,8 @@ _count_aggregations = ["nunique", "count"]
104
104
 
105
105
 
106
106
  class DateListDiff(PandasOperand, DateDiffMixin):
107
- is_binary = True
108
- has_symmetry_importance = True
107
+ is_binary: bool = True
108
+ has_symmetry_importance: bool = True
109
109
 
110
110
  aggregation: str
111
111
  replace_negative: bool = False
@@ -165,8 +165,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
165
165
 
166
166
 
167
167
  class DateListDiffBounded(DateListDiff):
168
- lower_bound: Optional[int]
169
- upper_bound: Optional[int]
168
+ lower_bound: Optional[int] = None
169
+ upper_bound: Optional[int] = None
170
170
 
171
171
  def __init__(self, **data: Any) -> None:
172
172
  if "name" not in data:
@@ -191,8 +191,8 @@ class DateListDiffBounded(DateListDiff):
191
191
 
192
192
 
193
193
  class DatePercentileBase(PandasOperand, abc.ABC):
194
- is_binary = True
195
- output_type = "float"
194
+ is_binary: bool = True
195
+ output_type: Optional[str] = "float"
196
196
 
197
197
  date_unit: Optional[str] = None
198
198
 
@@ -226,12 +226,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
226
226
 
227
227
 
228
228
  class DatePercentile(DatePercentileBase):
229
- name = "date_per"
230
- alias = "date_per_method1"
229
+ name: str = "date_per"
230
+ alias: Optional[str] = "date_per_method1"
231
231
 
232
- zero_month: Optional[int]
233
- zero_year: Optional[int]
234
- zero_bounds: Optional[List[float]]
232
+ zero_month: Optional[int] = None
233
+ zero_year: Optional[int] = None
234
+ zero_bounds: Optional[List[float]] = None
235
235
  step: int = 30
236
236
 
237
237
  def get_params(self) -> Dict[str, Optional[str]]:
@@ -246,7 +246,7 @@ class DatePercentile(DatePercentileBase):
246
246
  )
247
247
  return res
248
248
 
249
- @validator("zero_bounds", pre=True)
249
+ @field_validator("zero_bounds", mode="before")
250
250
  def validate_bounds(cls, value):
251
251
  if value is None or isinstance(value, list):
252
252
  return value
@@ -264,7 +264,7 @@ class DatePercentile(DatePercentileBase):
264
264
 
265
265
 
266
266
  class DatePercentileMethod2(DatePercentileBase):
267
- name = "date_per_method2"
267
+ name: str = "date_per_method2"
268
268
 
269
269
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
270
270
  pass
upgini/autofe/feature.py CHANGED
@@ -82,9 +82,9 @@ class Feature:
82
82
  self.alias = alias
83
83
 
84
84
  def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
85
- obj_dict = self.op.dict().copy()
85
+ obj_dict = self.op.model_dump().copy()
86
86
  obj_dict.update(params or {})
87
- self.op = self.op.__class__.parse_obj(obj_dict)
87
+ self.op = self.op.__class__.model_validate(obj_dict)
88
88
  self.op.set_params(params)
89
89
 
90
90
  for child in self.children:
upgini/autofe/groupby.py CHANGED
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
7
 
8
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
9
9
  agg: Optional[str]
10
- is_vectorizable = True
11
- is_grouping = True
12
- is_distribution_dependent = True
10
+ is_vectorizable: bool = True
11
+ is_grouping: bool = True
12
+ is_distribution_dependent: bool = True
13
13
 
14
14
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
15
  temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
24
24
 
25
25
 
26
26
  class GroupByThenMedian(GroupByThenAgg):
27
- name = "GroupByThenMedian"
28
- pandas_agg = "median"
29
- is_distribution_dependent = True
27
+ name: str = "GroupByThenMedian"
28
+ pandas_agg: str = "median"
29
+ is_distribution_dependent: bool = True
30
30
 
31
31
 
32
32
  class GroupByThenRank(PandasOperand, VectorizableMixin):
33
- name = "GroupByThenRank"
34
- is_vectorizable = True
35
- is_grouping = True
36
- output_type = "float"
37
- is_distribution_dependent = True
33
+ name: str = "GroupByThenRank"
34
+ is_vectorizable: bool = True
35
+ is_grouping: bool = True
36
+ output_type: Optional[str] = "float"
37
+ is_distribution_dependent: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
49
49
 
50
50
 
51
51
  class GroupByThenNUnique(PandasOperand, VectorizableMixin):
52
- name = "GroupByThenNUnique"
53
- is_vectorizable = True
54
- is_grouping = True
55
- output_type = "int"
56
- is_distribution_dependent = True
57
- input_type = "discrete"
52
+ name: str = "GroupByThenNUnique"
53
+ is_vectorizable: bool = True
54
+ is_grouping: bool = True
55
+ output_type: Optional[str] = "int"
56
+ is_distribution_dependent: bool = True
57
+ input_type: Optional[str] = "discrete"
58
58
 
59
59
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
60
60
  nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
69
69
 
70
70
 
71
71
  class GroupByThenFreq(PandasOperand):
72
- name = "GroupByThenFreq"
73
- is_grouping = True
74
- output_type = "float"
75
- is_distribution_dependent = True
76
- input_type = "discrete"
72
+ name: str = "GroupByThenFreq"
73
+ is_grouping: bool = True
74
+ output_type: Optional[str] = "float"
75
+ is_distribution_dependent: bool = True
76
+ input_type: Optional[str] = "discrete"
77
77
 
78
78
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
79
79
  def _f(x):
upgini/autofe/operand.py CHANGED
@@ -8,19 +8,19 @@ from pydantic import BaseModel
8
8
 
9
9
  class Operand(BaseModel):
10
10
  name: str
11
- alias: Optional[str]
11
+ alias: Optional[str] = None
12
12
  is_unary: bool = False
13
13
  is_symmetrical: bool = False
14
14
  has_symmetry_importance: bool = False
15
- input_type: Optional[str]
16
- output_type: Optional[str]
15
+ input_type: Optional[str] = None
16
+ output_type: Optional[str] = None
17
17
  is_categorical: bool = False
18
18
  is_vectorizable: bool = False
19
19
  is_grouping: bool = False
20
20
  is_binary: bool = False
21
21
  is_vector: bool = False
22
22
  is_distribution_dependent: bool = False
23
- params: Optional[Dict[str, str]]
23
+ params: Optional[Dict[str, str]] = None
24
24
 
25
25
  def set_params(self, params: Dict[str, str]):
26
26
  self.params = params
upgini/autofe/unary.py CHANGED
@@ -1,3 +1,4 @@
1
+ from typing import Optional
1
2
  import numpy as np
2
3
  import pandas as pd
3
4
  from sklearn.preprocessing import Normalizer
@@ -6,10 +7,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
7
 
7
8
 
8
9
  class Abs(PandasOperand, VectorizableMixin):
9
- name = "abs"
10
- is_unary = True
11
- is_vectorizable = True
12
- group_index = 0
10
+ name: str = "abs"
11
+ is_unary: bool = True
12
+ is_vectorizable: bool = True
13
+ group_index: int = 0
13
14
 
14
15
  def calculate_unary(self, data: pd.Series) -> pd.Series:
15
16
  return data.abs()
@@ -19,11 +20,11 @@ class Abs(PandasOperand, VectorizableMixin):
19
20
 
20
21
 
21
22
  class Log(PandasOperand, VectorizableMixin):
22
- name = "log"
23
- is_unary = True
24
- is_vectorizable = True
25
- output_type = "float"
26
- group_index = 0
23
+ name: str = "log"
24
+ is_unary: bool = True
25
+ is_vectorizable: bool = True
26
+ output_type: Optional[str] = "float"
27
+ group_index: int = 0
27
28
 
28
29
  def calculate_unary(self, data: pd.Series) -> pd.Series:
29
30
  return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -33,11 +34,11 @@ class Log(PandasOperand, VectorizableMixin):
33
34
 
34
35
 
35
36
  class Sqrt(PandasOperand, VectorizableMixin):
36
- name = "sqrt"
37
- is_unary = True
38
- is_vectorizable = True
39
- output_type = "float"
40
- group_index = 0
37
+ name: str = "sqrt"
38
+ is_unary: bool = True
39
+ is_vectorizable: bool = True
40
+ output_type: Optional[str] = "float"
41
+ group_index: int = 0
41
42
 
42
43
  def calculate_unary(self, data: pd.Series) -> pd.Series:
43
44
  return self._round_value(np.sqrt(np.abs(data)))
@@ -47,10 +48,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
47
48
 
48
49
 
49
50
  class Square(PandasOperand, VectorizableMixin):
50
- name = "square"
51
- is_unary = True
52
- is_vectorizable = True
53
- group_index = 0
51
+ name: str = "square"
52
+ is_unary: bool = True
53
+ is_vectorizable: bool = True
54
+ group_index: int = 0
54
55
 
55
56
  def calculate_unary(self, data: pd.Series) -> pd.Series:
56
57
  return np.square(data)
@@ -60,11 +61,11 @@ class Square(PandasOperand, VectorizableMixin):
60
61
 
61
62
 
62
63
  class Sigmoid(PandasOperand, VectorizableMixin):
63
- name = "sigmoid"
64
- is_unary = True
65
- is_vectorizable = True
66
- output_type = "float"
67
- group_index = 0
64
+ name: str = "sigmoid"
65
+ is_unary: bool = True
66
+ is_vectorizable: bool = True
67
+ output_type: Optional[str] = "float"
68
+ group_index: int = 0
68
69
 
69
70
  def calculate_unary(self, data: pd.Series) -> pd.Series:
70
71
  return self._round_value(1 / (1 + np.exp(-data)))
@@ -74,12 +75,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
74
75
 
75
76
 
76
77
  class Floor(PandasOperand, VectorizableMixin):
77
- name = "floor"
78
- is_unary = True
79
- is_vectorizable = True
80
- output_type = "int"
81
- input_type = "continuous"
82
- group_index = 0
78
+ name: str = "floor"
79
+ is_unary: bool = True
80
+ is_vectorizable: bool = True
81
+ output_type: Optional[str] = "int"
82
+ input_type: Optional[str] = "continuous"
83
+ group_index: int = 0
83
84
 
84
85
  def calculate_unary(self, data: pd.Series) -> pd.Series:
85
86
  return np.floor(data)
@@ -89,11 +90,11 @@ class Floor(PandasOperand, VectorizableMixin):
89
90
 
90
91
 
91
92
  class Residual(PandasOperand, VectorizableMixin):
92
- name = "residual"
93
- is_unary = True
94
- is_vectorizable = True
95
- input_type = "continuous"
96
- group_index = 0
93
+ name: str = "residual"
94
+ is_unary: bool = True
95
+ is_vectorizable: bool = True
96
+ input_type: Optional[str] = "continuous"
97
+ group_index: int = 0
97
98
 
98
99
  def calculate_unary(self, data: pd.Series) -> pd.Series:
99
100
  return data - np.floor(data)
@@ -103,11 +104,11 @@ class Residual(PandasOperand, VectorizableMixin):
103
104
 
104
105
 
105
106
  class Freq(PandasOperand):
106
- name = "freq"
107
- is_unary = True
108
- output_type = "float"
109
- is_distribution_dependent = True
110
- input_type = "discrete"
107
+ name: str = "freq"
108
+ is_unary: bool = True
109
+ output_type: Optional[str] = "float"
110
+ is_distribution_dependent: bool = True
111
+ input_type: Optional[str] = "discrete"
111
112
 
112
113
  def calculate_unary(self, data: pd.Series) -> pd.Series:
113
114
  value_counts = data.value_counts(normalize=True)
@@ -115,9 +116,9 @@ class Freq(PandasOperand):
115
116
 
116
117
 
117
118
  class Norm(PandasOperand):
118
- name = "norm"
119
- is_unary = True
120
- output_type = "float"
119
+ name: str = "norm"
120
+ is_unary: bool = True
121
+ output_type: Optional[str] = "float"
121
122
 
122
123
  def calculate_unary(self, data: pd.Series) -> pd.Series:
123
124
  data_dropna = data.dropna()
@@ -131,7 +132,7 @@ class Norm(PandasOperand):
131
132
 
132
133
 
133
134
  class Embeddings(PandasOperand):
134
- name = "emb"
135
- is_unary = True
136
- input_type = "string"
137
- output_type = "vector"
135
+ name: str = "emb"
136
+ is_unary: bool = True
137
+ input_type: Optional[str] = "string"
138
+ output_type: Optional[str] = "vector"
upgini/autofe/vector.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
1
+ from typing import List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Mean(PandasOperand, VectorizableMixin):
9
- name = "mean"
10
- output_type = "float"
11
- is_vector = True
12
- group_index = 0
9
+ name: str = "mean"
10
+ output_type: Optional[str] = "float"
11
+ is_vector: bool = True
12
+ group_index: int = 0
13
13
 
14
14
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
15
15
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
16
16
 
17
17
 
18
18
  class Sum(PandasOperand, VectorizableMixin):
19
- name = "sum"
20
- is_vector = True
21
- group_index = 0
19
+ name: str = "sum"
20
+ is_vector: bool = True
21
+ group_index: int = 0
22
22
 
23
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
@@ -23,7 +23,6 @@ from pandas.api.types import (
23
23
  is_datetime64_any_dtype,
24
24
  is_numeric_dtype,
25
25
  is_object_dtype,
26
- is_period_dtype,
27
26
  is_string_dtype,
28
27
  )
29
28
  from scipy.stats import ks_2samp
@@ -1408,7 +1407,9 @@ class FeaturesEnricher(TransformerMixin):
1408
1407
  # TODO maybe there is no more need for these convertions
1409
1408
  # Remove datetime features
1410
1409
  datetime_features = [
1411
- f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
1410
+ f
1411
+ for f in fitting_X.columns
1412
+ if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
1412
1413
  ]
1413
1414
  if len(datetime_features) > 0:
1414
1415
  self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
upgini/http.py CHANGED
@@ -39,18 +39,6 @@ from upgini.metadata import (
39
39
  from upgini.resource_bundle import bundle
40
40
  from upgini.utils.track_info import get_track_metrics
41
41
 
42
- # try:
43
- # from importlib.metadata import version # type: ignore
44
-
45
- # __version__ = version("upgini")
46
- # except ImportError:
47
- # try:
48
- # from importlib_metadata import version # type: ignore
49
-
50
- # __version__ = version("upgini")
51
- # except ImportError:
52
- # __version__ = "Upgini wasn't installed"
53
-
54
42
  UPGINI_URL: str = "UPGINI_URL"
55
43
  UPGINI_API_KEY: str = "UPGINI_API_KEY"
56
44
  DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
@@ -452,18 +440,18 @@ class _RestClient:
452
440
  content = file.read()
453
441
  md5_hash.update(content)
454
442
  digest = md5_hash.hexdigest()
455
- metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
443
+ metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
456
444
 
457
445
  digest_sha256 = hashlib.sha256(
458
446
  pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
459
447
  ).hexdigest()
460
- metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
448
+ metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
461
449
 
462
450
  with open(file_path, "rb") as file:
463
451
  files = {
464
452
  "metadata": (
465
453
  "metadata.json",
466
- metadata_with_md5.json(exclude_none=True).encode(),
454
+ metadata_with_md5.model_dump_json(exclude_none=True).encode(),
467
455
  "application/json",
468
456
  ),
469
457
  "tracking": (
@@ -471,13 +459,17 @@ class _RestClient:
471
459
  dumps(track_metrics).encode(),
472
460
  "application/json",
473
461
  ),
474
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
462
+ "metrics": (
463
+ "metrics.json",
464
+ metrics.model_dump_json(exclude_none=True).encode(),
465
+ "application/json",
466
+ ),
475
467
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
476
468
  }
477
469
  if search_customization is not None:
478
470
  files["customization"] = (
479
471
  "customization.json",
480
- search_customization.json(exclude_none=True).encode(),
472
+ search_customization.model_dump_json(exclude_none=True).encode(),
481
473
  "application/json",
482
474
  )
483
475
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
@@ -492,7 +484,7 @@ class _RestClient:
492
484
  def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
493
485
  api_path = self.CHECK_UPLOADED_FILE_URL_FMT_V2.format(file_upload_id)
494
486
  response = self._with_unauth_retry(
495
- lambda: self._send_post_req(api_path, trace_id, metadata.json(exclude_none=True))
487
+ lambda: self._send_post_req(api_path, trace_id, metadata.model_dump_json(exclude_none=True))
496
488
  )
497
489
  return bool(response)
498
490
 
@@ -506,11 +498,11 @@ class _RestClient:
506
498
  ) -> SearchTaskResponse:
507
499
  api_path = self.INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id)
508
500
  files = {
509
- "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
510
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
501
+ "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
502
+ "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
511
503
  }
512
504
  if search_customization is not None:
513
- files["customization"] = search_customization.json(exclude_none=True).encode()
505
+ files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
514
506
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
515
507
  response = self._with_unauth_retry(
516
508
  lambda: self._send_post_file_req_v2(
@@ -536,18 +528,18 @@ class _RestClient:
536
528
  content = file.read()
537
529
  md5_hash.update(content)
538
530
  digest = md5_hash.hexdigest()
539
- metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
531
+ metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
540
532
 
541
533
  digest_sha256 = hashlib.sha256(
542
534
  pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
543
535
  ).hexdigest()
544
- metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
536
+ metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
545
537
 
546
538
  with open(file_path, "rb") as file:
547
539
  files = {
548
540
  "metadata": (
549
541
  "metadata.json",
550
- metadata_with_md5.json(exclude_none=True).encode(),
542
+ metadata_with_md5.model_dump_json(exclude_none=True).encode(),
551
543
  "application/json",
552
544
  ),
553
545
  "tracking": (
@@ -555,13 +547,17 @@ class _RestClient:
555
547
  dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
556
548
  "application/json",
557
549
  ),
558
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
550
+ "metrics": (
551
+ "metrics.json",
552
+ metrics.model_dump_json(exclude_none=True).encode(),
553
+ "application/json",
554
+ ),
559
555
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
560
556
  }
561
557
  if search_customization is not None:
562
558
  files["customization"] = (
563
559
  "customization.json",
564
- search_customization.json(exclude_none=True).encode(),
560
+ search_customization.model_dump_json(exclude_none=True).encode(),
565
561
  "application/json",
566
562
  )
567
563
 
@@ -585,11 +581,11 @@ class _RestClient:
585
581
  ) -> SearchTaskResponse:
586
582
  api_path = self.VALIDATION_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id, initial_search_task_id)
587
583
  files = {
588
- "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
589
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
584
+ "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
585
+ "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
590
586
  }
591
587
  if search_customization is not None:
592
- files["customization"] = search_customization.json(exclude_none=True).encode()
588
+ files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
593
589
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
594
590
  response = self._with_unauth_retry(
595
591
  lambda: self._send_post_file_req_v2(
@@ -651,7 +647,11 @@ class _RestClient:
651
647
  with open(file_path, "rb") as file:
652
648
  files = {
653
649
  "file": (metadata.name, file, "application/octet-stream"),
654
- "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
650
+ "metadata": (
651
+ "metadata.json",
652
+ metadata.model_dump_json(exclude_none=True).encode(),
653
+ "application/json",
654
+ ),
655
655
  }
656
656
 
657
657
  return self._send_post_file_req_v2(api_path, files)
@@ -661,12 +661,12 @@ class _RestClient:
661
661
  def get_search_file_metadata(self, search_task_id: str, trace_id: str) -> FileMetadata:
662
662
  api_path = self.SEARCH_FILE_METADATA_URI_FMT_V2.format(search_task_id)
663
663
  response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
664
- return FileMetadata.parse_obj(response)
664
+ return FileMetadata.model_validate(response)
665
665
 
666
666
  def get_provider_search_metadata_v3(self, provider_search_task_id: str, trace_id: str) -> ProviderTaskMetadataV2:
667
667
  api_path = self.SEARCH_TASK_METADATA_FMT_V3.format(provider_search_task_id)
668
668
  response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
669
- return ProviderTaskMetadataV2.parse_obj(response)
669
+ return ProviderTaskMetadataV2.model_validate(response)
670
670
 
671
671
  def get_current_transform_usage(self, trace_id) -> TransformUsage:
672
672
  track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
upgini/lazy_import.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import importlib
2
+ import importlib.util
3
+ import importlib.machinery
2
4
 
3
5
 
4
6
  class LazyImport:
@@ -10,7 +12,18 @@ class LazyImport:
10
12
 
11
13
  def _load(self):
12
14
  if self._module is None:
13
- self._module = importlib.import_module(self.module_name)
15
+ # Load module and save link to it
16
+ spec = importlib.util.find_spec(self.module_name)
17
+ if spec is None:
18
+ raise ImportError(f"Module {self.module_name} not found")
19
+
20
+ # Create module
21
+ self._module = importlib.util.module_from_spec(spec)
22
+
23
+ # Execute module
24
+ spec.loader.exec_module(self._module)
25
+
26
+ # Get class from module
14
27
  self._class = getattr(self._module, self.class_name)
15
28
 
16
29
  def __call__(self, *args, **kwargs):
upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Set, Union
4
+ from typing import Any, Dict, List, Optional, Set, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -172,23 +172,23 @@ class FileMetricsInterval(BaseModel):
172
172
  date_cut: float
173
173
  count: float
174
174
  valid_count: float
175
- avg_target: Optional[float] # not for multiclass
176
- avg_score_etalon: Optional[float]
175
+ avg_target: Optional[float] = None # not for multiclass
176
+ avg_score_etalon: Optional[float] = None
177
177
 
178
178
 
179
179
  class FileMetrics(BaseModel):
180
180
  # etalon metadata
181
- task_type: Optional[ModelTaskType]
182
- label: Optional[ModelLabelType]
183
- count: Optional[int]
184
- valid_count: Optional[int]
185
- valid_rate: Optional[float]
186
- avg_target: Optional[float]
187
- metrics_binary_etalon: Optional[BinaryTask]
188
- metrics_regression_etalon: Optional[RegressionTask]
189
- metrics_multiclass_etalon: Optional[MulticlassTask]
190
- cuts: Optional[List[float]]
191
- interval: Optional[List[FileMetricsInterval]]
181
+ task_type: Optional[ModelTaskType] = None
182
+ label: Optional[ModelLabelType] = None
183
+ count: Optional[int] = None
184
+ valid_count: Optional[int] = None
185
+ valid_rate: Optional[float] = None
186
+ avg_target: Optional[float] = None
187
+ metrics_binary_etalon: Optional[BinaryTask] = None
188
+ metrics_regression_etalon: Optional[RegressionTask] = None
189
+ metrics_multiclass_etalon: Optional[MulticlassTask] = None
190
+ cuts: Optional[List[float]] = None
191
+ interval: Optional[List[FileMetricsInterval]] = None
192
192
 
193
193
 
194
194
  class NumericInterval(BaseModel):
@@ -202,25 +202,25 @@ class FileColumnMetadata(BaseModel):
202
202
  dataType: DataType
203
203
  meaningType: FileColumnMeaningType
204
204
  minMaxValues: Optional[NumericInterval] = None
205
- originalName: Optional[str]
205
+ originalName: Optional[str] = None
206
206
  # is this column contains keys from multiple key columns like msisdn1, msisdn2
207
207
  isUnnest: bool = False
208
208
  # list of original etalon key column names like msisdn1, msisdn2
209
- unnestKeyNames: Optional[List[str]]
209
+ unnestKeyNames: Optional[List[str]] = None
210
210
 
211
211
 
212
212
  class FileMetadata(BaseModel):
213
213
  name: str
214
- description: Optional[str]
214
+ description: Optional[str] = None
215
215
  columns: List[FileColumnMetadata]
216
216
  searchKeys: List[List[str]]
217
- excludeFeaturesSources: Optional[List[str]]
218
- hierarchicalGroupKeys: Optional[List[str]]
219
- hierarchicalSubgroupKeys: Optional[List[str]]
220
- taskType: Optional[ModelTaskType]
221
- rowsCount: Optional[int]
222
- checksumMD5: Optional[str]
223
- digest: Optional[str]
217
+ excludeFeaturesSources: Optional[List[str]] = None
218
+ hierarchicalGroupKeys: Optional[List[str]] = None
219
+ hierarchicalSubgroupKeys: Optional[List[str]] = None
220
+ taskType: Optional[ModelTaskType] = None
221
+ rowsCount: Optional[int] = None
222
+ checksumMD5: Optional[str] = None
223
+ digest: Optional[str] = None
224
224
 
225
225
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
226
226
  for c in self.columns:
@@ -244,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
244
244
  source: str
245
245
  hit_rate: float
246
246
  shap_value: float
247
- commercial_schema: Optional[str]
248
- data_provider: Optional[str]
249
- data_providers: Optional[List[str]]
250
- data_provider_link: Optional[str]
251
- data_provider_links: Optional[List[str]]
252
- data_source: Optional[str]
253
- data_sources: Optional[List[str]]
254
- data_source_link: Optional[str]
255
- data_source_links: Optional[List[str]]
256
- doc_link: Optional[str]
257
- update_frequency: Optional[str]
247
+ commercial_schema: Optional[str] = None
248
+ data_provider: Optional[str] = None
249
+ data_providers: Optional[List[str]] = None
250
+ data_provider_link: Optional[str] = None
251
+ data_provider_links: Optional[List[str]] = None
252
+ data_source: Optional[str] = None
253
+ data_sources: Optional[List[str]] = None
254
+ data_source_link: Optional[str] = None
255
+ data_source_links: Optional[List[str]] = None
256
+ doc_link: Optional[str] = None
257
+ update_frequency: Optional[str] = None
258
258
 
259
259
 
260
260
  class HitRateMetrics(BaseModel):
@@ -274,48 +274,48 @@ class ModelEvalSet(BaseModel):
274
274
  class BaseColumnMetadata(BaseModel):
275
275
  original_name: str
276
276
  hashed_name: str
277
- ads_definition_id: Optional[str]
277
+ ads_definition_id: Optional[str] = None
278
278
  is_augmented: bool
279
279
 
280
280
 
281
281
  class GeneratedFeatureMetadata(BaseModel):
282
- alias: Optional[str]
282
+ alias: Optional[str] = None
283
283
  formula: str
284
284
  display_index: str
285
285
  base_columns: List[BaseColumnMetadata]
286
- operator_params: Optional[Dict[str, str]]
286
+ operator_params: Optional[Dict[str, str]] = None
287
287
 
288
288
 
289
289
  class ProviderTaskMetadataV2(BaseModel):
290
290
  features: List[FeaturesMetadataV2]
291
- hit_rate_metrics: Optional[HitRateMetrics]
292
- eval_set_metrics: Optional[List[ModelEvalSet]]
293
- zero_hit_rate_search_keys: Optional[List[str]]
294
- features_used_for_embeddings: Optional[List[str]]
295
- shuffle_kfold: Optional[bool]
296
- generated_features: Optional[List[GeneratedFeatureMetadata]]
291
+ hit_rate_metrics: Optional[HitRateMetrics] = None
292
+ eval_set_metrics: Optional[List[ModelEvalSet]] = None
293
+ zero_hit_rate_search_keys: Optional[List[str]] = None
294
+ features_used_for_embeddings: Optional[List[str]] = None
295
+ shuffle_kfold: Optional[bool] = None
296
+ generated_features: Optional[List[GeneratedFeatureMetadata]] = None
297
297
 
298
298
 
299
299
  class FeaturesFilter(BaseModel):
300
- minImportance: Optional[float]
301
- maxPSI: Optional[float]
302
- maxCount: Optional[int]
303
- selectedFeatures: Optional[List[str]]
300
+ minImportance: Optional[float] = None
301
+ maxPSI: Optional[float] = None
302
+ maxCount: Optional[int] = None
303
+ selectedFeatures: Optional[List[str]] = None
304
304
 
305
305
 
306
306
  class RuntimeParameters(BaseModel):
307
- properties: Dict[str, str] = {}
307
+ properties: Dict[str, Any] = {}
308
308
 
309
309
 
310
310
  class SearchCustomization(BaseModel):
311
- featuresFilter: Optional[FeaturesFilter]
312
- extractFeatures: Optional[bool]
313
- accurateModel: Optional[bool]
314
- importanceThreshold: Optional[float]
315
- maxFeatures: Optional[int]
316
- returnScores: Optional[bool]
317
- runtimeParameters: Optional[RuntimeParameters]
318
- metricsCalculation: Optional[bool]
311
+ featuresFilter: Optional[FeaturesFilter] = None
312
+ extractFeatures: Optional[bool] = None
313
+ accurateModel: Optional[bool] = None
314
+ importanceThreshold: Optional[float] = None
315
+ maxFeatures: Optional[int] = None
316
+ returnScores: Optional[bool] = None
317
+ runtimeParameters: Optional[RuntimeParameters] = None
318
+ metricsCalculation: Optional[bool] = None
319
319
 
320
320
  def __repr__(self):
321
321
  return (
@@ -10,7 +10,6 @@ from pandas.api.types import (
10
10
  is_float_dtype,
11
11
  is_numeric_dtype,
12
12
  is_object_dtype,
13
- is_period_dtype,
14
13
  is_string_dtype,
15
14
  )
16
15
 
@@ -135,7 +134,7 @@ class Normalizer:
135
134
 
136
135
  removed_features = []
137
136
  for f in features:
138
- if is_datetime(df[f]) or is_period_dtype(df[f]):
137
+ if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
139
138
  removed_features.append(f)
140
139
  df.drop(columns=f, inplace=True)
141
140
 
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import is_numeric_dtype, is_period_dtype
9
+ from pandas.api.types import is_numeric_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
12
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
@@ -84,7 +84,7 @@ class DateTimeSearchKeyConverter:
84
84
  df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
85
85
  elif isinstance(df[self.date_column].values[0], datetime.date):
86
86
  df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
87
- elif is_period_dtype(df[self.date_column]):
87
+ elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
88
88
  df[self.date_column] = df[self.date_column].dt.to_timestamp()
89
89
  elif is_numeric_dtype(df[self.date_column]):
90
90
  # 315532801 - 2524608001 - seconds
@@ -207,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
207
207
  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
208
208
  df = df.copy()
209
209
  seconds = "datetime_seconds"
210
- if is_period_dtype(df[date_col]):
210
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
211
211
  df[date_col] = df[date_col].dt.to_timestamp()
212
212
  else:
213
213
  df[date_col] = pd.to_datetime(df[date_col])
@@ -275,7 +275,7 @@ def validate_dates_distribution(
275
275
  if col in search_keys:
276
276
  continue
277
277
  try:
278
- if is_period_dtype(X[col]):
278
+ if isinstance(X[col].dtype, pd.PeriodDtype):
279
279
  pass
280
280
  elif pd.__version__ >= "2.0.0":
281
281
  # Format mixed to avoid massive warnings
@@ -290,7 +290,7 @@ def validate_dates_distribution(
290
290
  if maybe_date_col is None:
291
291
  return
292
292
 
293
- if is_period_dtype(X[maybe_date_col]):
293
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
294
294
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
295
295
  elif pd.__version__ >= "2.0.0":
296
296
  dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
@@ -1,12 +1,8 @@
1
1
  from typing import Optional
2
2
 
3
+ import numpy as np
3
4
  import pandas as pd
4
- from pandas.api.types import (
5
- is_float_dtype,
6
- is_int64_dtype,
7
- is_object_dtype,
8
- is_string_dtype,
9
- )
5
+ from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
10
6
 
11
7
  from upgini.errors import ValidationError
12
8
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
@@ -63,7 +59,9 @@ class PhoneSearchKeyConverter:
63
59
  convert_func = self.phone_str_to_int_safe
64
60
  elif is_float_dtype(df[self.phone_column]):
65
61
  convert_func = self.phone_float_to_int_safe
66
- elif is_int64_dtype(df[self.phone_column]):
62
+ elif df[self.phone_column].dtype == np.int64 or isinstance(
63
+ df[self.phone_column].dtype, pd.Int64Dtype
64
+ ):
67
65
  convert_func = self.phone_int_to_int_safe
68
66
  else:
69
67
  raise ValidationError(
@@ -194,4 +194,7 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
194
194
  test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
195
195
 
196
196
  # Calculate the PSI
197
- return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
197
+ try:
198
+ return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
199
+ except Exception:
200
+ return np.nan
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.316
3
+ Version: 1.1.316a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
29
29
  Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: lightgbm>=3.3.2
32
- Requires-Dist: numpy>=1.19.0
32
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
33
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
34
- Requires-Dist: pydantic<2.0.0,>=1.8.2
34
+ Requires-Dist: pydantic<3.0.0,>1.0.0
35
35
  Requires-Dist: pyjwt>=2.8.0
36
36
  Requires-Dist: python-bidi==0.4.2
37
37
  Requires-Dist: python-dateutil>=2.8.0
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=DQCLPSfZIiyKQ88S6JJcAEA3dURvJk2NhtYNJeB5Mq8,24
1
+ upgini/__about__.py,sha256=lnBx1YP_mYM1XVyjFPE_kJqGG8UiGn3zcRQt1R0zSbY,26
2
2
  upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=Gu4gsnMVjcsfWnJlu4Np3jpE9Au1UywhuHQb0Xv5YNg,187982
7
- upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
8
- upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
- upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
6
+ upgini/features_enricher.py,sha256=_d8ya5RRoYN0o6mV6gda-bLdOngQ4rb1SA51SlM_TG0,188002
7
+ upgini/http.py,sha256=gCN5ru_I6JNHk-m6-Ckjhd23iMzOAzDSLb0tSEcxkC4,43068
8
+ upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
+ upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
10
  upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
11
11
  upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -15,19 +15,19 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
- upgini/autofe/binary.py,sha256=2Z5FrfdCtesKEHBuabEBiRvwOAzcRoFKAX1wvGpHL0I,7003
19
- upgini/autofe/date.py,sha256=AO3P8GtUHD6vPE_1Vrj3nsnXYBxiXe7vun6aLHReZgQ,9064
20
- upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
21
- upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
- upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
- upgini/autofe/unary.py,sha256=oIMf-IVy7L7GkzxMmQyExX0tOH9RhWeQh7cGxxMDiPk,3832
24
- upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
18
+ upgini/autofe/binary.py,sha256=xRBT7RNqQ7pprz6cRpO1KnvZCb7PvU3QXBfaP6Omqi4,7425
19
+ upgini/autofe/date.py,sha256=eLPrO2Cgm74VB4rPtIaeUDuI5vmLiGnygHSvU4aGHWU,9223
20
+ upgini/autofe/feature.py,sha256=CivPkE7YrAtDrgF8WhVPnDAnNDR8gbRQ-8_hXiQE6ew,14234
21
+ upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
+ upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
23
+ upgini/autofe/unary.py,sha256=RiK-Fz3fgjPlqWWfro6x7qChjEZ8W8RTnl5-MT1kaQA,4218
24
+ upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=8gH1oabPNZrC1kHSRFxGGcO0o6yNDlOJXCLzzExq-3s,7451
30
+ upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
33
  upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
42
42
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
- upgini/utils/datetime_utils.py,sha256=niZcf2YqAwokUFUW474zajlzv9HAMf7nv9v_WPJHpyc,12123
45
+ upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
46
46
  upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
53
- upgini/utils/phone_utils.py,sha256=PTSRfGAWCuLy8R6I8X6clcc1K7bZXIIrZ_alIB8irC8,10368
53
+ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
54
54
  upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
57
+ upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.316.dist-info/METADATA,sha256=12UKpdX0d9nky8XWhKtyQjDK2MVWtbsEr811NSWrKmE,48222
61
- upgini-1.1.316.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.316.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.316.dist-info/RECORD,,
60
+ upgini-1.1.316a2.dist-info/METADATA,sha256=h4ZR7hMkhteC4WNoyFJ7dXVTpUJyphWHa8cKmhP5BEQ,48232
61
+ upgini-1.1.316a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.316a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.316a2.dist-info/RECORD,,