upgini 1.1.317__py3-none-any.whl → 1.2.0a1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.317"
1
+ __version__ = "1.2.0a1"
upgini/autofe/binary.py CHANGED
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
12
- name = "min"
13
- is_binary = True
14
- is_symmetrical = True
15
- has_symmetry_importance = True
12
+ name: str = "min"
13
+ is_binary: bool = True
14
+ is_symmetrical: bool = True
15
+ has_symmetry_importance: bool = True
16
16
 
17
17
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
21
  class Max(PandasOperand):
22
- name = "max"
23
- is_binary = True
24
- is_symmetrical = True
25
- has_symmetry_importance = True
22
+ name: str = "max"
23
+ is_binary: bool = True
24
+ is_symmetrical: bool = True
25
+ has_symmetry_importance: bool = True
26
26
 
27
27
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
31
  class Add(PandasOperand, VectorizableMixin):
32
- name = "+"
33
- alias = "add"
34
- is_binary = True
35
- is_symmetrical = True
36
- has_symmetry_importance = True
37
- is_vectorizable = True
32
+ name: str = "+"
33
+ alias: str = "add"
34
+ is_binary: bool = True
35
+ is_symmetrical: bool = True
36
+ has_symmetry_importance: bool = True
37
+ is_vectorizable: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
48
48
 
49
49
 
50
50
  class Subtract(PandasOperand, VectorizableMixin):
51
- name = "-"
52
- alias = "sub"
53
- is_binary = True
54
- is_symmetrical = True
55
- has_symmetry_importance = True
56
- is_vectorizable = True
51
+ name: str = "-"
52
+ alias: str = "sub"
53
+ is_binary: bool = True
54
+ is_symmetrical: bool = True
55
+ has_symmetry_importance: bool = True
56
+ is_vectorizable: bool = True
57
57
 
58
58
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
59
59
  return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
67
67
 
68
68
 
69
69
  class Multiply(PandasOperand, VectorizableMixin):
70
- name = "*"
71
- alias = "mul"
72
- is_binary = True
73
- is_symmetrical = True
74
- has_symmetry_importance = True
75
- is_vectorizable = True
70
+ name: str = "*"
71
+ alias: str = "mul"
72
+ is_binary: bool = True
73
+ is_symmetrical: bool = True
74
+ has_symmetry_importance: bool = True
75
+ is_vectorizable: bool = True
76
76
 
77
77
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
78
78
  return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
86
86
 
87
87
 
88
88
  class Divide(PandasOperand, VectorizableMixin):
89
- name = "/"
90
- alias = "div"
91
- is_binary = True
92
- has_symmetry_importance = True
93
- is_vectorizable = True
94
- output_type = "float"
89
+ name: str = "/"
90
+ alias: str = "div"
91
+ is_binary: bool = True
92
+ has_symmetry_importance: bool = True
93
+ is_vectorizable: bool = True
94
+ output_type: Optional[str] = "float"
95
95
 
96
96
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
97
97
  return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
105
105
 
106
106
 
107
107
  class Combine(PandasOperand):
108
- name = "Combine"
109
- is_binary = True
110
- has_symmetry_importance = True
111
- output_type = "object"
108
+ name: str = "Combine"
109
+ is_binary: bool = True
110
+ has_symmetry_importance: bool = True
111
+ output_type: Optional[str] = "object"
112
112
 
113
113
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
114
114
  temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
117
117
 
118
118
 
119
119
  class CombineThenFreq(PandasOperand):
120
- name = "CombineThenFreq"
121
- is_binary = True
122
- is_symmetrical = True
123
- has_symmetry_importance = True
124
- output_type = "float"
125
- is_distribution_dependent = True
126
- input_type = "discrete"
120
+ name: str = "CombineThenFreq"
121
+ is_binary: bool = True
122
+ is_symmetrical: bool = True
123
+ has_symmetry_importance: bool = True
124
+ output_type: Optional[str] = "float"
125
+ is_distribution_dependent: bool = True
126
+ input_type: Optional[str] = "discrete"
127
127
 
128
128
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
129
129
  temp = left.astype(str) + "_" + right.astype(str)
@@ -133,11 +133,11 @@ class CombineThenFreq(PandasOperand):
133
133
 
134
134
 
135
135
  class Distance(PandasOperand):
136
- name = "dist"
137
- is_binary = True
138
- output_type = "float"
139
- is_symmetrical = True
140
- has_symmetry_importance = True
136
+ name: str = "dist"
137
+ is_binary: bool = True
138
+ output_type: Optional[str] = "float"
139
+ is_symmetrical: bool = True
140
+ has_symmetry_importance: bool = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
@@ -158,11 +158,11 @@ class Distance(PandasOperand):
158
158
 
159
159
  # Left for backward compatibility
160
160
  class Sim(Distance):
161
- name = "sim"
162
- is_binary = True
163
- output_type = "float"
164
- is_symmetrical = True
165
- has_symmetry_importance = True
161
+ name: str = "sim"
162
+ is_binary: bool = True
163
+ output_type: Optional[str] = "float"
164
+ is_symmetrical: bool = True
165
+ has_symmetry_importance: bool = True
166
166
 
167
167
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
168
  return 1 - super().calculate_binary(left, right)
@@ -191,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
191
191
 
192
192
 
193
193
  class JaroWinklerSim1(StringSim):
194
- name = "sim_jw1"
195
- is_binary = True
196
- input_type = "string"
197
- output_type = "float"
198
- is_symmetrical = True
199
- has_symmetry_importance = True
194
+ name: str = "sim_jw1"
195
+ is_binary: bool = True
196
+ input_type: Optional[str] = "string"
197
+ output_type: Optional[str] = "float"
198
+ is_symmetrical: bool = True
199
+ has_symmetry_importance: bool = True
200
200
 
201
201
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
202
202
  return value
@@ -206,12 +206,12 @@ class JaroWinklerSim1(StringSim):
206
206
 
207
207
 
208
208
  class JaroWinklerSim2(StringSim):
209
- name = "sim_jw2"
210
- is_binary = True
211
- input_type = "string"
212
- output_type = "float"
213
- is_symmetrical = True
214
- has_symmetry_importance = True
209
+ name: str = "sim_jw2"
210
+ is_binary: bool = True
211
+ input_type: Optional[str] = "string"
212
+ output_type: Optional[str] = "float"
213
+ is_symmetrical: bool = True
214
+ has_symmetry_importance: bool = True
215
215
 
216
216
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
217
217
  return value[::-1] if value is not None else None
@@ -221,12 +221,12 @@ class JaroWinklerSim2(StringSim):
221
221
 
222
222
 
223
223
  class LevenshteinSim(StringSim):
224
- name = "sim_lv"
225
- is_binary = True
226
- input_type = "string"
227
- output_type = "float"
228
- is_symmetrical = True
229
- has_symmetry_importance = True
224
+ name: str = "sim_lv"
225
+ is_binary: bool = True
226
+ input_type: Optional[str] = "string"
227
+ output_type: Optional[str] = "float"
228
+ is_symmetrical: bool = True
229
+ has_symmetry_importance: bool = True
230
230
 
231
231
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
232
232
  return value
upgini/autofe/date.py CHANGED
@@ -5,11 +5,16 @@ from typing import Any, Dict, List, Optional, Union
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
- from pydantic import BaseModel, validator
8
+ from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
10
  from upgini.autofe.operand import PandasOperand
11
11
 
12
12
 
13
+ def get_pydantic_version():
14
+ major_version = int(pydantic_version.split('.')[0])
15
+ return major_version
16
+
17
+
13
18
  class DateDiffMixin(BaseModel):
14
19
  diff_unit: str = "D"
15
20
  left_unit: Optional[str] = None
@@ -39,10 +44,10 @@ class DateDiffMixin(BaseModel):
39
44
 
40
45
 
41
46
  class DateDiff(PandasOperand, DateDiffMixin):
42
- name = "date_diff"
43
- alias = "date_diff_type1"
44
- is_binary = True
45
- has_symmetry_importance = True
47
+ name: str = "date_diff"
48
+ alias: Optional[str] = "date_diff_type1"
49
+ is_binary: bool = True
50
+ has_symmetry_importance: bool = True
46
51
 
47
52
  replace_negative: bool = False
48
53
 
@@ -71,9 +76,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
71
76
 
72
77
 
73
78
  class DateDiffType2(PandasOperand, DateDiffMixin):
74
- name = "date_diff_type2"
75
- is_binary = True
76
- has_symmetry_importance = True
79
+ name: str = "date_diff_type2"
80
+ is_binary: bool = True
81
+ has_symmetry_importance: bool = True
77
82
 
78
83
  def get_params(self) -> Dict[str, Optional[str]]:
79
84
  res = super().get_params()
@@ -105,8 +110,8 @@ _count_aggregations = ["nunique", "count"]
105
110
 
106
111
 
107
112
  class DateListDiff(PandasOperand, DateDiffMixin):
108
- is_binary = True
109
- has_symmetry_importance = True
113
+ is_binary: bool = True
114
+ has_symmetry_importance: bool = True
110
115
 
111
116
  aggregation: str
112
117
  replace_negative: bool = False
@@ -166,8 +171,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
166
171
 
167
172
 
168
173
  class DateListDiffBounded(DateListDiff):
169
- lower_bound: Optional[int]
170
- upper_bound: Optional[int]
174
+ lower_bound: Optional[int] = None
175
+ upper_bound: Optional[int] = None
171
176
 
172
177
  def __init__(self, **data: Any) -> None:
173
178
  if "name" not in data:
@@ -192,8 +197,8 @@ class DateListDiffBounded(DateListDiff):
192
197
 
193
198
 
194
199
  class DatePercentileBase(PandasOperand, abc.ABC):
195
- is_binary = True
196
- output_type = "float"
200
+ is_binary: bool = True
201
+ output_type: Optional[str] = "float"
197
202
 
198
203
  date_unit: Optional[str] = None
199
204
 
@@ -227,12 +232,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
227
232
 
228
233
 
229
234
  class DatePercentile(DatePercentileBase):
230
- name = "date_per"
231
- alias = "date_per_method1"
235
+ name: str = "date_per"
236
+ alias: Optional[str] = "date_per_method1"
232
237
 
233
- zero_month: Optional[int]
234
- zero_year: Optional[int]
235
- zero_bounds: Optional[List[float]]
238
+ zero_month: Optional[int] = None
239
+ zero_year: Optional[int] = None
240
+ zero_bounds: Optional[List[float]] = None
236
241
  step: int = 30
237
242
 
238
243
  def get_params(self) -> Dict[str, Optional[str]]:
@@ -247,12 +252,25 @@ class DatePercentile(DatePercentileBase):
247
252
  )
248
253
  return res
249
254
 
250
- @validator("zero_bounds", pre=True)
251
- def validate_bounds(cls, value):
252
- if value is None or isinstance(value, list):
255
+ # Check Pydantic version
256
+ if get_pydantic_version() >= 2:
257
+ # Use @field_validator for Pydantic 2.x
258
+ from pydantic import field_validator
259
+
260
+ @field_validator('zero_bounds', mode='before')
261
+ def parse_zero_bounds(cls, value):
262
+ if isinstance(value, str):
263
+ return json.loads(value)
264
+ return value
265
+ else:
266
+ # Use @validator for Pydantic 1.x
267
+ from pydantic import validator
268
+
269
+ @validator('zero_bounds', pre=True)
270
+ def parse_zero_bounds(cls, value):
271
+ if isinstance(value, str):
272
+ return json.loads(value)
253
273
  return value
254
- elif isinstance(value, str):
255
- return json.loads(value)
256
274
 
257
275
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
258
276
  months = date_col.dt.month
@@ -265,7 +283,7 @@ class DatePercentile(DatePercentileBase):
265
283
 
266
284
 
267
285
  class DatePercentileMethod2(DatePercentileBase):
268
- name = "date_per_method2"
286
+ name: str = "date_per_method2"
269
287
 
270
288
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
271
289
  pass
upgini/autofe/groupby.py CHANGED
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
7
 
8
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
9
9
  agg: Optional[str]
10
- is_vectorizable = True
11
- is_grouping = True
12
- is_distribution_dependent = True
10
+ is_vectorizable: bool = True
11
+ is_grouping: bool = True
12
+ is_distribution_dependent: bool = True
13
13
 
14
14
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
15
  temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
24
24
 
25
25
 
26
26
  class GroupByThenMedian(GroupByThenAgg):
27
- name = "GroupByThenMedian"
28
- pandas_agg = "median"
29
- is_distribution_dependent = True
27
+ name: str = "GroupByThenMedian"
28
+ pandas_agg: str = "median"
29
+ is_distribution_dependent: bool = True
30
30
 
31
31
 
32
32
  class GroupByThenRank(PandasOperand, VectorizableMixin):
33
- name = "GroupByThenRank"
34
- is_vectorizable = True
35
- is_grouping = True
36
- output_type = "float"
37
- is_distribution_dependent = True
33
+ name: str = "GroupByThenRank"
34
+ is_vectorizable: bool = True
35
+ is_grouping: bool = True
36
+ output_type: Optional[str] = "float"
37
+ is_distribution_dependent: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
49
49
 
50
50
 
51
51
  class GroupByThenNUnique(PandasOperand, VectorizableMixin):
52
- name = "GroupByThenNUnique"
53
- is_vectorizable = True
54
- is_grouping = True
55
- output_type = "int"
56
- is_distribution_dependent = True
57
- input_type = "discrete"
52
+ name: str = "GroupByThenNUnique"
53
+ is_vectorizable: bool = True
54
+ is_grouping: bool = True
55
+ output_type: Optional[str] = "int"
56
+ is_distribution_dependent: bool = True
57
+ input_type: Optional[str] = "discrete"
58
58
 
59
59
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
60
60
  nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
69
69
 
70
70
 
71
71
  class GroupByThenFreq(PandasOperand):
72
- name = "GroupByThenFreq"
73
- is_grouping = True
74
- output_type = "float"
75
- is_distribution_dependent = True
76
- input_type = "discrete"
72
+ name: str = "GroupByThenFreq"
73
+ is_grouping: bool = True
74
+ output_type: Optional[str] = "float"
75
+ is_distribution_dependent: bool = True
76
+ input_type: Optional[str] = "discrete"
77
77
 
78
78
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
79
79
  def _f(x):
upgini/autofe/operand.py CHANGED
@@ -8,19 +8,19 @@ from pydantic import BaseModel
8
8
 
9
9
  class Operand(BaseModel):
10
10
  name: str
11
- alias: Optional[str]
11
+ alias: Optional[str] = None
12
12
  is_unary: bool = False
13
13
  is_symmetrical: bool = False
14
14
  has_symmetry_importance: bool = False
15
- input_type: Optional[str]
16
- output_type: Optional[str]
15
+ input_type: Optional[str] = None
16
+ output_type: Optional[str] = None
17
17
  is_categorical: bool = False
18
18
  is_vectorizable: bool = False
19
19
  is_grouping: bool = False
20
20
  is_binary: bool = False
21
21
  is_vector: bool = False
22
22
  is_distribution_dependent: bool = False
23
- params: Optional[Dict[str, str]]
23
+ params: Optional[Dict[str, str]] = None
24
24
 
25
25
  def set_params(self, params: Dict[str, str]):
26
26
  self.params = params
upgini/autofe/unary.py CHANGED
@@ -1,3 +1,4 @@
1
+ from typing import Optional
1
2
  import numpy as np
2
3
  import pandas as pd
3
4
  from sklearn.preprocessing import Normalizer
@@ -6,10 +7,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
7
 
7
8
 
8
9
  class Abs(PandasOperand, VectorizableMixin):
9
- name = "abs"
10
- is_unary = True
11
- is_vectorizable = True
12
- group_index = 0
10
+ name: str = "abs"
11
+ is_unary: bool = True
12
+ is_vectorizable: bool = True
13
+ group_index: int = 0
13
14
 
14
15
  def calculate_unary(self, data: pd.Series) -> pd.Series:
15
16
  return data.abs()
@@ -19,11 +20,11 @@ class Abs(PandasOperand, VectorizableMixin):
19
20
 
20
21
 
21
22
  class Log(PandasOperand, VectorizableMixin):
22
- name = "log"
23
- is_unary = True
24
- is_vectorizable = True
25
- output_type = "float"
26
- group_index = 0
23
+ name: str = "log"
24
+ is_unary: bool = True
25
+ is_vectorizable: bool = True
26
+ output_type: Optional[str] = "float"
27
+ group_index: int = 0
27
28
 
28
29
  def calculate_unary(self, data: pd.Series) -> pd.Series:
29
30
  return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -33,11 +34,11 @@ class Log(PandasOperand, VectorizableMixin):
33
34
 
34
35
 
35
36
  class Sqrt(PandasOperand, VectorizableMixin):
36
- name = "sqrt"
37
- is_unary = True
38
- is_vectorizable = True
39
- output_type = "float"
40
- group_index = 0
37
+ name: str = "sqrt"
38
+ is_unary: bool = True
39
+ is_vectorizable: bool = True
40
+ output_type: Optional[str] = "float"
41
+ group_index: int = 0
41
42
 
42
43
  def calculate_unary(self, data: pd.Series) -> pd.Series:
43
44
  return self._round_value(np.sqrt(np.abs(data)))
@@ -47,10 +48,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
47
48
 
48
49
 
49
50
  class Square(PandasOperand, VectorizableMixin):
50
- name = "square"
51
- is_unary = True
52
- is_vectorizable = True
53
- group_index = 0
51
+ name: str = "square"
52
+ is_unary: bool = True
53
+ is_vectorizable: bool = True
54
+ group_index: int = 0
54
55
 
55
56
  def calculate_unary(self, data: pd.Series) -> pd.Series:
56
57
  return np.square(data)
@@ -60,11 +61,11 @@ class Square(PandasOperand, VectorizableMixin):
60
61
 
61
62
 
62
63
  class Sigmoid(PandasOperand, VectorizableMixin):
63
- name = "sigmoid"
64
- is_unary = True
65
- is_vectorizable = True
66
- output_type = "float"
67
- group_index = 0
64
+ name: str = "sigmoid"
65
+ is_unary: bool = True
66
+ is_vectorizable: bool = True
67
+ output_type: Optional[str] = "float"
68
+ group_index: int = 0
68
69
 
69
70
  def calculate_unary(self, data: pd.Series) -> pd.Series:
70
71
  return self._round_value(1 / (1 + np.exp(-data)))
@@ -74,12 +75,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
74
75
 
75
76
 
76
77
  class Floor(PandasOperand, VectorizableMixin):
77
- name = "floor"
78
- is_unary = True
79
- is_vectorizable = True
80
- output_type = "int"
81
- input_type = "continuous"
82
- group_index = 0
78
+ name: str = "floor"
79
+ is_unary: bool = True
80
+ is_vectorizable: bool = True
81
+ output_type: Optional[str] = "int"
82
+ input_type: Optional[str] = "continuous"
83
+ group_index: int = 0
83
84
 
84
85
  def calculate_unary(self, data: pd.Series) -> pd.Series:
85
86
  return np.floor(data)
@@ -89,11 +90,11 @@ class Floor(PandasOperand, VectorizableMixin):
89
90
 
90
91
 
91
92
  class Residual(PandasOperand, VectorizableMixin):
92
- name = "residual"
93
- is_unary = True
94
- is_vectorizable = True
95
- input_type = "continuous"
96
- group_index = 0
93
+ name: str = "residual"
94
+ is_unary: bool = True
95
+ is_vectorizable: bool = True
96
+ input_type: Optional[str] = "continuous"
97
+ group_index: int = 0
97
98
 
98
99
  def calculate_unary(self, data: pd.Series) -> pd.Series:
99
100
  return data - np.floor(data)
@@ -103,11 +104,11 @@ class Residual(PandasOperand, VectorizableMixin):
103
104
 
104
105
 
105
106
  class Freq(PandasOperand):
106
- name = "freq"
107
- is_unary = True
108
- output_type = "float"
109
- is_distribution_dependent = True
110
- input_type = "discrete"
107
+ name: str = "freq"
108
+ is_unary: bool = True
109
+ output_type: Optional[str] = "float"
110
+ is_distribution_dependent: bool = True
111
+ input_type: Optional[str] = "discrete"
111
112
 
112
113
  def calculate_unary(self, data: pd.Series) -> pd.Series:
113
114
  value_counts = data.value_counts(normalize=True)
@@ -115,9 +116,9 @@ class Freq(PandasOperand):
115
116
 
116
117
 
117
118
  class Norm(PandasOperand):
118
- name = "norm"
119
- is_unary = True
120
- output_type = "float"
119
+ name: str = "norm"
120
+ is_unary: bool = True
121
+ output_type: Optional[str] = "float"
121
122
 
122
123
  def calculate_unary(self, data: pd.Series) -> pd.Series:
123
124
  data_dropna = data.dropna()
@@ -131,7 +132,7 @@ class Norm(PandasOperand):
131
132
 
132
133
 
133
134
  class Embeddings(PandasOperand):
134
- name = "emb"
135
- is_unary = True
136
- input_type = "string"
137
- output_type = "vector"
135
+ name: str = "emb"
136
+ is_unary: bool = True
137
+ input_type: Optional[str] = "string"
138
+ output_type: Optional[str] = "vector"
upgini/autofe/vector.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
1
+ from typing import List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Mean(PandasOperand, VectorizableMixin):
9
- name = "mean"
10
- output_type = "float"
11
- is_vector = True
12
- group_index = 0
9
+ name: str = "mean"
10
+ output_type: Optional[str] = "float"
11
+ is_vector: bool = True
12
+ group_index: int = 0
13
13
 
14
14
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
15
15
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
16
16
 
17
17
 
18
18
  class Sum(PandasOperand, VectorizableMixin):
19
- name = "sum"
20
- is_vector = True
21
- group_index = 0
19
+ name: str = "sum"
20
+ is_vector: bool = True
21
+ group_index: int = 0
22
22
 
23
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
upgini/dataset.py CHANGED
@@ -18,6 +18,7 @@ from pandas.api.types import (
18
18
  from upgini.errors import ValidationError
19
19
  from upgini.http import ProgressStage, SearchProgress, _RestClient
20
20
  from upgini.metadata import (
21
+ ENTITY_SYSTEM_RECORD_ID,
21
22
  EVAL_SET_INDEX,
22
23
  SYSTEM_RECORD_ID,
23
24
  TARGET,
@@ -157,7 +158,11 @@ class Dataset: # (pd.DataFrame):
157
158
  raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
158
159
 
159
160
  def __validate_max_row_count(self):
160
- if len(self.data) > self.MAX_ROWS:
161
+ if ENTITY_SYSTEM_RECORD_ID in self.data.columns:
162
+ rows_count = self.data[ENTITY_SYSTEM_RECORD_ID].nunique()
163
+ else:
164
+ rows_count = len(self.data)
165
+ if rows_count > self.MAX_ROWS:
161
166
  raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
162
167
 
163
168
  def __target_value(self) -> pd.Series:
@@ -199,14 +204,14 @@ class Dataset: # (pd.DataFrame):
199
204
  elif self.task_type == ModelTaskType.REGRESSION:
200
205
  if not is_float_dtype(target):
201
206
  try:
202
- self.data[target_column] = self.data[target_column].astype("float")
207
+ self.data[target_column] = self.data[target_column].astype("float64")
203
208
  except ValueError:
204
209
  self.logger.exception("Failed to cast target to float for regression task type")
205
210
  raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
206
211
  elif self.task_type == ModelTaskType.TIMESERIES:
207
212
  if not is_float_dtype(target):
208
213
  try:
209
- self.data[target_column] = self.data[target_column].astype("float")
214
+ self.data[target_column] = self.data[target_column].astype("float64")
210
215
  except ValueError:
211
216
  self.logger.exception("Failed to cast target to float for timeseries task type")
212
217
  raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
@@ -23,7 +23,6 @@ from pandas.api.types import (
23
23
  is_datetime64_any_dtype,
24
24
  is_numeric_dtype,
25
25
  is_object_dtype,
26
- is_period_dtype,
27
26
  is_string_dtype,
28
27
  )
29
28
  from scipy.stats import ks_2samp
@@ -1408,7 +1407,9 @@ class FeaturesEnricher(TransformerMixin):
1408
1407
  # TODO maybe there is no more need for these conversions
1409
1408
  # Remove datetime features
1410
1409
  datetime_features = [
1411
- f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
1410
+ f
1411
+ for f in fitting_X.columns
1412
+ if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
1412
1413
  ]
1413
1414
  if len(datetime_features) > 0:
1414
1415
  self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
@@ -2041,7 +2042,7 @@ class FeaturesEnricher(TransformerMixin):
2041
2042
 
2042
2043
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
2043
2044
  df[columns_for_system_record_id], index=False
2044
- ).astype("Float64")
2045
+ ).astype("float64")
2045
2046
 
2046
2047
  # Explode multiple search keys
2047
2048
  df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
@@ -2107,7 +2108,7 @@ class FeaturesEnricher(TransformerMixin):
2107
2108
  # search keys might be changed after explode
2108
2109
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2109
2110
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2110
- "Float64"
2111
+ "float64"
2111
2112
  )
2112
2113
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2113
2114
  meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
@@ -2667,6 +2668,7 @@ class FeaturesEnricher(TransformerMixin):
2667
2668
 
2668
2669
  autofe_description = self.get_autofe_features_description()
2669
2670
  if autofe_description is not None:
2671
+ self.logger.info(f"AutoFE descriptions: {autofe_description}")
2670
2672
  display_html_dataframe(autofe_description, autofe_description, "*Description of AutoFE feature names")
2671
2673
 
2672
2674
  if self._has_paid_features(exclude_features_sources):
upgini/http.py CHANGED
@@ -39,18 +39,6 @@ from upgini.metadata import (
39
39
  from upgini.resource_bundle import bundle
40
40
  from upgini.utils.track_info import get_track_metrics
41
41
 
42
- # try:
43
- # from importlib.metadata import version # type: ignore
44
-
45
- # __version__ = version("upgini")
46
- # except ImportError:
47
- # try:
48
- # from importlib_metadata import version # type: ignore
49
-
50
- # __version__ = version("upgini")
51
- # except ImportError:
52
- # __version__ = "Upgini wasn't installed"
53
-
54
42
  UPGINI_URL: str = "UPGINI_URL"
55
43
  UPGINI_API_KEY: str = "UPGINI_API_KEY"
56
44
  DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
@@ -471,7 +459,11 @@ class _RestClient:
471
459
  dumps(track_metrics).encode(),
472
460
  "application/json",
473
461
  ),
474
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
462
+ "metrics": (
463
+ "metrics.json",
464
+ metrics.json(exclude_none=True).encode(),
465
+ "application/json",
466
+ ),
475
467
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
476
468
  }
477
469
  if search_customization is not None:
@@ -555,7 +547,11 @@ class _RestClient:
555
547
  dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
556
548
  "application/json",
557
549
  ),
558
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
550
+ "metrics": (
551
+ "metrics.json",
552
+ metrics.json(exclude_none=True).encode(),
553
+ "application/json",
554
+ ),
559
555
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
560
556
  }
561
557
  if search_customization is not None:
@@ -651,7 +647,11 @@ class _RestClient:
651
647
  with open(file_path, "rb") as file:
652
648
  files = {
653
649
  "file": (metadata.name, file, "application/octet-stream"),
654
- "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
650
+ "metadata": (
651
+ "metadata.json",
652
+ metadata.json(exclude_none=True).encode(),
653
+ "application/json",
654
+ ),
655
655
  }
656
656
 
657
657
  return self._send_post_file_req_v2(api_path, files)
upgini/lazy_import.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import importlib
2
+ import importlib.util
3
+ import importlib.machinery
2
4
 
3
5
 
4
6
  class LazyImport:
@@ -10,7 +12,18 @@ class LazyImport:
10
12
 
11
13
  def _load(self):
12
14
  if self._module is None:
13
- self._module = importlib.import_module(self.module_name)
15
+ # Load module and save a reference to it
16
+ spec = importlib.util.find_spec(self.module_name)
17
+ if spec is None:
18
+ raise ImportError(f"Module {self.module_name} not found")
19
+
20
+ # Create module
21
+ self._module = importlib.util.module_from_spec(spec)
22
+
23
+ # Execute module
24
+ spec.loader.exec_module(self._module)
25
+
26
+ # Get class from module
14
27
  self._class = getattr(self._module, self.class_name)
15
28
 
16
29
  def __call__(self, *args, **kwargs):
upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Set, Union
4
+ from typing import Any, Dict, List, Optional, Set, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -172,23 +172,23 @@ class FileMetricsInterval(BaseModel):
172
172
  date_cut: float
173
173
  count: float
174
174
  valid_count: float
175
- avg_target: Optional[float] # not for multiclass
176
- avg_score_etalon: Optional[float]
175
+ avg_target: Optional[float] = None # not for multiclass
176
+ avg_score_etalon: Optional[float] = None
177
177
 
178
178
 
179
179
  class FileMetrics(BaseModel):
180
180
  # etalon metadata
181
- task_type: Optional[ModelTaskType]
182
- label: Optional[ModelLabelType]
183
- count: Optional[int]
184
- valid_count: Optional[int]
185
- valid_rate: Optional[float]
186
- avg_target: Optional[float]
187
- metrics_binary_etalon: Optional[BinaryTask]
188
- metrics_regression_etalon: Optional[RegressionTask]
189
- metrics_multiclass_etalon: Optional[MulticlassTask]
190
- cuts: Optional[List[float]]
191
- interval: Optional[List[FileMetricsInterval]]
181
+ task_type: Optional[ModelTaskType] = None
182
+ label: Optional[ModelLabelType] = None
183
+ count: Optional[int] = None
184
+ valid_count: Optional[int] = None
185
+ valid_rate: Optional[float] = None
186
+ avg_target: Optional[float] = None
187
+ metrics_binary_etalon: Optional[BinaryTask] = None
188
+ metrics_regression_etalon: Optional[RegressionTask] = None
189
+ metrics_multiclass_etalon: Optional[MulticlassTask] = None
190
+ cuts: Optional[List[float]] = None
191
+ interval: Optional[List[FileMetricsInterval]] = None
192
192
 
193
193
 
194
194
  class NumericInterval(BaseModel):
@@ -202,25 +202,25 @@ class FileColumnMetadata(BaseModel):
202
202
  dataType: DataType
203
203
  meaningType: FileColumnMeaningType
204
204
  minMaxValues: Optional[NumericInterval] = None
205
- originalName: Optional[str]
205
+ originalName: Optional[str] = None
206
206
  # is this column contains keys from multiple key columns like msisdn1, msisdn2
207
207
  isUnnest: bool = False
208
208
  # list of original etalon key column names like msisdn1, msisdn2
209
- unnestKeyNames: Optional[List[str]]
209
+ unnestKeyNames: Optional[List[str]] = None
210
210
 
211
211
 
212
212
  class FileMetadata(BaseModel):
213
213
  name: str
214
- description: Optional[str]
214
+ description: Optional[str] = None
215
215
  columns: List[FileColumnMetadata]
216
216
  searchKeys: List[List[str]]
217
- excludeFeaturesSources: Optional[List[str]]
218
- hierarchicalGroupKeys: Optional[List[str]]
219
- hierarchicalSubgroupKeys: Optional[List[str]]
220
- taskType: Optional[ModelTaskType]
221
- rowsCount: Optional[int]
222
- checksumMD5: Optional[str]
223
- digest: Optional[str]
217
+ excludeFeaturesSources: Optional[List[str]] = None
218
+ hierarchicalGroupKeys: Optional[List[str]] = None
219
+ hierarchicalSubgroupKeys: Optional[List[str]] = None
220
+ taskType: Optional[ModelTaskType] = None
221
+ rowsCount: Optional[int] = None
222
+ checksumMD5: Optional[str] = None
223
+ digest: Optional[str] = None
224
224
 
225
225
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
226
226
  for c in self.columns:
@@ -244,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
244
244
  source: str
245
245
  hit_rate: float
246
246
  shap_value: float
247
- commercial_schema: Optional[str]
248
- data_provider: Optional[str]
249
- data_providers: Optional[List[str]]
250
- data_provider_link: Optional[str]
251
- data_provider_links: Optional[List[str]]
252
- data_source: Optional[str]
253
- data_sources: Optional[List[str]]
254
- data_source_link: Optional[str]
255
- data_source_links: Optional[List[str]]
256
- doc_link: Optional[str]
257
- update_frequency: Optional[str]
247
+ commercial_schema: Optional[str] = None
248
+ data_provider: Optional[str] = None
249
+ data_providers: Optional[List[str]] = None
250
+ data_provider_link: Optional[str] = None
251
+ data_provider_links: Optional[List[str]] = None
252
+ data_source: Optional[str] = None
253
+ data_sources: Optional[List[str]] = None
254
+ data_source_link: Optional[str] = None
255
+ data_source_links: Optional[List[str]] = None
256
+ doc_link: Optional[str] = None
257
+ update_frequency: Optional[str] = None
258
258
 
259
259
 
260
260
  class HitRateMetrics(BaseModel):
@@ -274,48 +274,48 @@ class ModelEvalSet(BaseModel):
274
274
  class BaseColumnMetadata(BaseModel):
275
275
  original_name: str
276
276
  hashed_name: str
277
- ads_definition_id: Optional[str]
277
+ ads_definition_id: Optional[str] = None
278
278
  is_augmented: bool
279
279
 
280
280
 
281
281
  class GeneratedFeatureMetadata(BaseModel):
282
- alias: Optional[str]
282
+ alias: Optional[str] = None
283
283
  formula: str
284
284
  display_index: str
285
285
  base_columns: List[BaseColumnMetadata]
286
- operator_params: Optional[Dict[str, str]]
286
+ operator_params: Optional[Dict[str, str]] = None
287
287
 
288
288
 
289
289
  class ProviderTaskMetadataV2(BaseModel):
290
290
  features: List[FeaturesMetadataV2]
291
- hit_rate_metrics: Optional[HitRateMetrics]
292
- eval_set_metrics: Optional[List[ModelEvalSet]]
293
- zero_hit_rate_search_keys: Optional[List[str]]
294
- features_used_for_embeddings: Optional[List[str]]
295
- shuffle_kfold: Optional[bool]
296
- generated_features: Optional[List[GeneratedFeatureMetadata]]
291
+ hit_rate_metrics: Optional[HitRateMetrics] = None
292
+ eval_set_metrics: Optional[List[ModelEvalSet]] = None
293
+ zero_hit_rate_search_keys: Optional[List[str]] = None
294
+ features_used_for_embeddings: Optional[List[str]] = None
295
+ shuffle_kfold: Optional[bool] = None
296
+ generated_features: Optional[List[GeneratedFeatureMetadata]] = None
297
297
 
298
298
 
299
299
  class FeaturesFilter(BaseModel):
300
- minImportance: Optional[float]
301
- maxPSI: Optional[float]
302
- maxCount: Optional[int]
303
- selectedFeatures: Optional[List[str]]
300
+ minImportance: Optional[float] = None
301
+ maxPSI: Optional[float] = None
302
+ maxCount: Optional[int] = None
303
+ selectedFeatures: Optional[List[str]] = None
304
304
 
305
305
 
306
306
  class RuntimeParameters(BaseModel):
307
- properties: Dict[str, str] = {}
307
+ properties: Dict[str, Any] = {}
308
308
 
309
309
 
310
310
  class SearchCustomization(BaseModel):
311
- featuresFilter: Optional[FeaturesFilter]
312
- extractFeatures: Optional[bool]
313
- accurateModel: Optional[bool]
314
- importanceThreshold: Optional[float]
315
- maxFeatures: Optional[int]
316
- returnScores: Optional[bool]
317
- runtimeParameters: Optional[RuntimeParameters]
318
- metricsCalculation: Optional[bool]
311
+ featuresFilter: Optional[FeaturesFilter] = None
312
+ extractFeatures: Optional[bool] = None
313
+ accurateModel: Optional[bool] = None
314
+ importanceThreshold: Optional[float] = None
315
+ maxFeatures: Optional[int] = None
316
+ returnScores: Optional[bool] = None
317
+ runtimeParameters: Optional[RuntimeParameters] = None
318
+ metricsCalculation: Optional[bool] = None
319
319
 
320
320
  def __repr__(self):
321
321
  return (
@@ -10,7 +10,6 @@ from pandas.api.types import (
10
10
  is_float_dtype,
11
11
  is_numeric_dtype,
12
12
  is_object_dtype,
13
- is_period_dtype,
14
13
  is_string_dtype,
15
14
  )
16
15
 
@@ -135,7 +134,7 @@ class Normalizer:
135
134
 
136
135
  removed_features = []
137
136
  for f in features:
138
- if is_datetime(df[f]) or is_period_dtype(df[f]):
137
+ if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
139
138
  removed_features.append(f)
140
139
  df.drop(columns=f, inplace=True)
141
140
 
upgini/search_task.py CHANGED
@@ -3,6 +3,7 @@ import tempfile
3
3
  import time
4
4
  from functools import lru_cache
5
5
  from typing import Dict, List, Optional
6
+ import uuid
6
7
 
7
8
  import pandas as pd
8
9
 
@@ -97,10 +98,7 @@ class SearchTask:
97
98
  time.sleep(self.POLLING_DELAY_SECONDS)
98
99
  except KeyboardInterrupt as e:
99
100
  if not check_fit:
100
- print(bundle.get("search_stopping"))
101
- self.rest_client.stop_search_task_v2(trace_id, search_task_id)
102
- self.logger.warning(f"Search {search_task_id} stopped by user")
103
- print(bundle.get("search_stopped"))
101
+ self._stop(trace_id)
104
102
  raise e
105
103
  print()
106
104
 
@@ -133,6 +131,14 @@ class SearchTask:
133
131
 
134
132
  return self
135
133
 
134
+ def _stop(self, trace_id: Optional[str] = None):
135
+ trace_id = trace_id or uuid.uuid4()
136
+ search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
137
+ print(bundle.get("search_stopping"))
138
+ self.rest_client.stop_search_task_v2(trace_id, search_task_id)
139
+ self.logger.warning(f"Search {search_task_id} stopped by user")
140
+ print(bundle.get("search_stopped"))
141
+
136
142
  def get_all_features_metadata_v2(self) -> Optional[List[FeaturesMetadataV2]]:
137
143
  if self.provider_metadata_v2 is None:
138
144
  return None
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import is_numeric_dtype, is_period_dtype
9
+ from pandas.api.types import is_numeric_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
12
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
@@ -84,7 +84,7 @@ class DateTimeSearchKeyConverter:
84
84
  df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
85
85
  elif isinstance(df[self.date_column].values[0], datetime.date):
86
86
  df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
87
- elif is_period_dtype(df[self.date_column]):
87
+ elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
88
88
  df[self.date_column] = df[self.date_column].dt.to_timestamp()
89
89
  elif is_numeric_dtype(df[self.date_column]):
90
90
  # 315532801 - 2524608001 - seconds
@@ -207,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
207
207
  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
208
208
  df = df.copy()
209
209
  seconds = "datetime_seconds"
210
- if is_period_dtype(df[date_col]):
210
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
211
211
  df[date_col] = df[date_col].dt.to_timestamp()
212
212
  else:
213
213
  df[date_col] = pd.to_datetime(df[date_col])
@@ -275,7 +275,7 @@ def validate_dates_distribution(
275
275
  if col in search_keys:
276
276
  continue
277
277
  try:
278
- if is_period_dtype(X[col]):
278
+ if isinstance(X[col].dtype, pd.PeriodDtype):
279
279
  pass
280
280
  elif pd.__version__ >= "2.0.0":
281
281
  # Format mixed to avoid massive warnings
@@ -290,7 +290,7 @@ def validate_dates_distribution(
290
290
  if maybe_date_col is None:
291
291
  return
292
292
 
293
- if is_period_dtype(X[maybe_date_col]):
293
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
294
294
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
295
295
  elif pd.__version__ >= "2.0.0":
296
296
  dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
@@ -1,12 +1,8 @@
1
1
  from typing import Optional
2
2
 
3
+ import numpy as np
3
4
  import pandas as pd
4
- from pandas.api.types import (
5
- is_float_dtype,
6
- is_int64_dtype,
7
- is_object_dtype,
8
- is_string_dtype,
9
- )
5
+ from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
10
6
 
11
7
  from upgini.errors import ValidationError
12
8
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
@@ -63,7 +59,9 @@ class PhoneSearchKeyConverter:
63
59
  convert_func = self.phone_str_to_int_safe
64
60
  elif is_float_dtype(df[self.phone_column]):
65
61
  convert_func = self.phone_float_to_int_safe
66
- elif is_int64_dtype(df[self.phone_column]):
62
+ elif df[self.phone_column].dtype == np.int64 or isinstance(
63
+ df[self.phone_column].dtype, pd.Int64Dtype
64
+ ):
67
65
  convert_func = self.phone_int_to_int_safe
68
66
  else:
69
67
  raise ValidationError(
@@ -25,7 +25,7 @@ class PostalCodeSearchKeyConverter:
25
25
  if is_string_dtype(df[self.postal_code_column]) or is_object_dtype(df[self.postal_code_column]):
26
26
  try:
27
27
  df[self.postal_code_column] = (
28
- df[self.postal_code_column].astype("string").astype("Float64").astype("Int64").astype("string")
28
+ df[self.postal_code_column].astype("string").astype("float64").astype("Int64").astype("string")
29
29
  )
30
30
  except Exception:
31
31
  pass
@@ -194,4 +194,7 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
194
194
  test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
195
195
 
196
196
  # Calculate the PSI
197
- return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
197
+ try:
198
+ return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
199
+ except Exception:
200
+ return np.nan
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.317
3
+ Version: 1.2.0a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
29
29
  Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: lightgbm>=3.3.2
32
- Requires-Dist: numpy>=1.19.0
32
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
33
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
34
- Requires-Dist: pydantic<2.0.0,>=1.8.2
34
+ Requires-Dist: pydantic<3.0.0,>1.0.0
35
35
  Requires-Dist: pyjwt>=2.8.0
36
36
  Requires-Dist: python-bidi==0.4.2
37
37
  Requires-Dist: python-dateutil>=2.8.0
@@ -1,33 +1,33 @@
1
- upgini/__about__.py,sha256=7A4Mpkf8cSUSzwIJzMaQ6hlkjN2sldlyOHl5dtLNJkE,24
1
+ upgini/__about__.py,sha256=dMk28IuEJr_qWW7xH2uH2BnZ8G_djCORGGd6opmGetw,24
2
2
  upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
4
+ upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=Gu4gsnMVjcsfWnJlu4Np3jpE9Au1UywhuHQb0Xv5YNg,187982
7
- upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
8
- upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
- upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
6
+ upgini/features_enricher.py,sha256=twH4qdl91iHZF_AraLk0aIbRDw61S_DYtCWCZ34Yjjg,188077
7
+ upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
+ upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
+ upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
10
  upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
11
- upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
11
+ upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
- upgini/autofe/binary.py,sha256=2Z5FrfdCtesKEHBuabEBiRvwOAzcRoFKAX1wvGpHL0I,7003
19
- upgini/autofe/date.py,sha256=ijB9RCh5wBwl03Nl8zDYA50gpL4sqmAkYVYzVPm1bn0,9070
18
+ upgini/autofe/binary.py,sha256=TRjEdxsfyPY5E8ksYfdKMmU6GtvALfGFPNVIG7DBhzM,7520
19
+ upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
20
  upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
21
- upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
- upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
- upgini/autofe/unary.py,sha256=oIMf-IVy7L7GkzxMmQyExX0tOH9RhWeQh7cGxxMDiPk,3832
24
- upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
21
+ upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
+ upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
23
+ upgini/autofe/unary.py,sha256=RiK-Fz3fgjPlqWWfro6x7qChjEZ8W8RTnl5-MT1kaQA,4218
24
+ upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=8gH1oabPNZrC1kHSRFxGGcO0o6yNDlOJXCLzzExq-3s,7451
30
+ upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
33
  upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
42
42
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
- upgini/utils/datetime_utils.py,sha256=niZcf2YqAwokUFUW474zajlzv9HAMf7nv9v_WPJHpyc,12123
45
+ upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
46
46
  upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
53
- upgini/utils/phone_utils.py,sha256=PTSRfGAWCuLy8R6I8X6clcc1K7bZXIIrZ_alIB8irC8,10368
54
- upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
53
+ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
54
+ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
57
+ upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.317.dist-info/METADATA,sha256=MAx5zlya3JBerLBEmC9me552zgexw4gy4Cfc2VuNzSg,48222
61
- upgini-1.1.317.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.317.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.317.dist-info/RECORD,,
60
+ upgini-1.2.0a1.dist-info/METADATA,sha256=568JisotupYzFolx0QDyv_qN5CtSIEuHuium23_SDp8,48230
61
+ upgini-1.2.0a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.0a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.0a1.dist-info/RECORD,,