upgini 1.1.315a3579.dev1__tar.gz → 1.1.316a1__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (68):
  1. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/PKG-INFO +3 -3
  2. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/pyproject.toml +2 -2
  3. upgini-1.1.316a1/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/binary.py +71 -71
  5. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/date.py +21 -21
  6. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/feature.py +2 -2
  7. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/groupby.py +22 -22
  8. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/operand.py +4 -4
  9. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/unary.py +47 -46
  10. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/vector.py +8 -8
  11. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/data_source/data_source_publisher.py +9 -0
  12. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/dataset.py +34 -387
  13. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/features_enricher.py +338 -169
  14. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/http.py +20 -31
  15. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/lazy_import.py +14 -1
  16. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/metadata.py +72 -57
  17. upgini-1.1.316a1/src/upgini/normalizer/normalize_utils.py +202 -0
  18. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/country_utils.py +16 -0
  19. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/datetime_utils.py +41 -20
  20. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/email_utils.py +49 -17
  21. upgini-1.1.316a1/src/upgini/utils/ip_utils.py +152 -0
  22. upgini-1.1.315a3579.dev1/src/upgini/normalizer/phone_normalizer.py → upgini-1.1.316a1/src/upgini/utils/phone_utils.py +41 -27
  23. upgini-1.1.316a1/src/upgini/utils/postal_code_utils.py +45 -0
  24. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/target_utils.py +4 -1
  25. upgini-1.1.315a3579.dev1/src/upgini/__about__.py +0 -1
  26. upgini-1.1.315a3579.dev1/src/upgini/utils/ip_utils.py +0 -53
  27. upgini-1.1.315a3579.dev1/src/upgini/utils/phone_utils.py +0 -11
  28. upgini-1.1.315a3579.dev1/src/upgini/utils/postal_code_utils.py +0 -11
  29. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/.gitignore +0 -0
  30. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/LICENSE +0 -0
  31. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/README.md +0 -0
  32. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/__init__.py +0 -0
  33. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/ads.py +0 -0
  34. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/ads_management/__init__.py +0 -0
  35. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/ads_management/ads_manager.py +0 -0
  36. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/__init__.py +0 -0
  37. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/autofe/all_operands.py +0 -0
  38. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/data_source/__init__.py +0 -0
  39. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/errors.py +0 -0
  40. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/mdc/__init__.py +0 -0
  41. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/mdc/context.py +0 -0
  42. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/metrics.py +0 -0
  43. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/normalizer/__init__.py +0 -0
  44. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/search_task.py +0 -0
  53. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/spinner.py +0 -0
  54. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  58. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/cv_utils.py +0 -0
  59. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  60. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/display_utils.py +0 -0
  61. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  62. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/features_validator.py +0 -0
  63. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/format.py +0 -0
  64. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/track_info.py +0 -0
  67. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/utils/warning_counter.py +0 -0
  68. {upgini-1.1.315a3579.dev1 → upgini-1.1.316a1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.315a3579.dev1
3
+ Version: 1.1.316a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
29
29
  Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: lightgbm>=3.3.2
32
- Requires-Dist: numpy>=1.19.0
32
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
33
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
34
- Requires-Dist: pydantic<2.0.0,>=1.8.2
34
+ Requires-Dist: pydantic>=2.7.0
35
35
  Requires-Dist: pyjwt>=2.8.0
36
36
  Requires-Dist: python-bidi==0.4.2
37
37
  Requires-Dist: python-dateutil>=2.8.0
@@ -39,9 +39,9 @@ dependencies = [
39
39
  "fastparquet>=0.8.1",
40
40
  "ipywidgets>=8.1.0",
41
41
  "lightgbm>=3.3.2",
42
- "numpy>=1.19.0",
42
+ "numpy>=1.19.0,<=1.26.4",
43
43
  "pandas>=1.1.0,<3.0.0",
44
- "pydantic>=1.8.2,<2.0.0",
44
+ "pydantic>=2.7.0",
45
45
  "pyjwt>=2.8.0",
46
46
  "python-dateutil>=2.8.0",
47
47
  "python-json-logger>=2.0.2",
@@ -0,0 +1 @@
1
+ __version__ = "1.1.316a1"
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
12
- name = "min"
13
- is_binary = True
14
- is_symmetrical = True
15
- has_symmetry_importance = True
12
+ name: str = "min"
13
+ is_binary: bool = True
14
+ is_symmetrical: bool = True
15
+ has_symmetry_importance: bool = True
16
16
 
17
17
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
21
  class Max(PandasOperand):
22
- name = "max"
23
- is_binary = True
24
- is_symmetrical = True
25
- has_symmetry_importance = True
22
+ name: str = "max"
23
+ is_binary: bool = True
24
+ is_symmetrical: bool = True
25
+ has_symmetry_importance: bool = True
26
26
 
27
27
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
31
  class Add(PandasOperand, VectorizableMixin):
32
- name = "+"
33
- alias = "add"
34
- is_binary = True
35
- is_symmetrical = True
36
- has_symmetry_importance = True
37
- is_vectorizable = True
32
+ name: str = "+"
33
+ alias: str = "add"
34
+ is_binary: bool = True
35
+ is_symmetrical: bool = True
36
+ has_symmetry_importance: bool = True
37
+ is_vectorizable: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
48
48
 
49
49
 
50
50
  class Subtract(PandasOperand, VectorizableMixin):
51
- name = "-"
52
- alias = "sub"
53
- is_binary = True
54
- is_symmetrical = True
55
- has_symmetry_importance = True
56
- is_vectorizable = True
51
+ name: str = "-"
52
+ alias: str = "sub"
53
+ is_binary: bool = True
54
+ is_symmetrical: bool = True
55
+ has_symmetry_importance: bool = True
56
+ is_vectorizable: bool = True
57
57
 
58
58
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
59
59
  return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
67
67
 
68
68
 
69
69
  class Multiply(PandasOperand, VectorizableMixin):
70
- name = "*"
71
- alias = "mul"
72
- is_binary = True
73
- is_symmetrical = True
74
- has_symmetry_importance = True
75
- is_vectorizable = True
70
+ name: str = "*"
71
+ alias: str = "mul"
72
+ is_binary: bool = True
73
+ is_symmetrical: bool = True
74
+ has_symmetry_importance: bool = True
75
+ is_vectorizable: bool = True
76
76
 
77
77
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
78
78
  return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
86
86
 
87
87
 
88
88
  class Divide(PandasOperand, VectorizableMixin):
89
- name = "/"
90
- alias = "div"
91
- is_binary = True
92
- has_symmetry_importance = True
93
- is_vectorizable = True
94
- output_type = "float"
89
+ name: str = "/"
90
+ alias: str = "div"
91
+ is_binary: bool = True
92
+ has_symmetry_importance: bool = True
93
+ is_vectorizable: bool = True
94
+ output_type: Optional[str] = "float"
95
95
 
96
96
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
97
97
  return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
105
105
 
106
106
 
107
107
  class Combine(PandasOperand):
108
- name = "Combine"
109
- is_binary = True
110
- has_symmetry_importance = True
111
- output_type = "object"
108
+ name: str = "Combine"
109
+ is_binary: bool = True
110
+ has_symmetry_importance: bool = True
111
+ output_type: Optional[str] = "object"
112
112
 
113
113
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
114
114
  temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
117
117
 
118
118
 
119
119
  class CombineThenFreq(PandasOperand):
120
- name = "CombineThenFreq"
121
- is_binary = True
122
- is_symmetrical = True
123
- has_symmetry_importance = True
124
- output_type = "float"
125
- is_distribution_dependent = True
126
- input_type = "discrete"
120
+ name: str = "CombineThenFreq"
121
+ is_binary: bool = True
122
+ is_symmetrical: bool = True
123
+ has_symmetry_importance: bool = True
124
+ output_type: Optional[str] = "float"
125
+ is_distribution_dependent: bool = True
126
+ input_type: Optional[str] = "discrete"
127
127
 
128
128
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
129
129
  temp = left.astype(str) + "_" + right.astype(str)
@@ -133,11 +133,11 @@ class CombineThenFreq(PandasOperand):
133
133
 
134
134
 
135
135
  class Distance(PandasOperand):
136
- name = "dist"
137
- is_binary = True
138
- output_type = "float"
139
- is_symmetrical = True
140
- has_symmetry_importance = True
136
+ name: str = "dist"
137
+ is_binary: bool = True
138
+ output_type: Optional[str] = "float"
139
+ is_symmetrical: bool = True
140
+ has_symmetry_importance: bool = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
@@ -155,11 +155,11 @@ class Distance(PandasOperand):
155
155
 
156
156
  # Left for backward compatibility
157
157
  class Sim(Distance):
158
- name = "sim"
159
- is_binary = True
160
- output_type = "float"
161
- is_symmetrical = True
162
- has_symmetry_importance = True
158
+ name: str = "sim"
159
+ is_binary: bool = True
160
+ output_type: Optional[str] = "float"
161
+ is_symmetrical: bool = True
162
+ has_symmetry_importance: bool = True
163
163
 
164
164
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
165
165
  return 1 - super().calculate_binary(left, right)
@@ -188,12 +188,12 @@ class StringSim(PandasOperand, abc.ABC):
188
188
 
189
189
 
190
190
  class JaroWinklerSim1(StringSim):
191
- name = "sim_jw1"
192
- is_binary = True
193
- input_type = "string"
194
- output_type = "float"
195
- is_symmetrical = True
196
- has_symmetry_importance = True
191
+ name: str = "sim_jw1"
192
+ is_binary: bool = True
193
+ input_type: Optional[str] = "string"
194
+ output_type: Optional[str] = "float"
195
+ is_symmetrical: bool = True
196
+ has_symmetry_importance: bool = True
197
197
 
198
198
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
199
199
  return value
@@ -203,12 +203,12 @@ class JaroWinklerSim1(StringSim):
203
203
 
204
204
 
205
205
  class JaroWinklerSim2(StringSim):
206
- name = "sim_jw2"
207
- is_binary = True
208
- input_type = "string"
209
- output_type = "float"
210
- is_symmetrical = True
211
- has_symmetry_importance = True
206
+ name: str = "sim_jw2"
207
+ is_binary: bool = True
208
+ input_type: Optional[str] = "string"
209
+ output_type: Optional[str] = "float"
210
+ is_symmetrical: bool = True
211
+ has_symmetry_importance: bool = True
212
212
 
213
213
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
214
214
  return value[::-1] if value is not None else None
@@ -218,12 +218,12 @@ class JaroWinklerSim2(StringSim):
218
218
 
219
219
 
220
220
  class LevenshteinSim(StringSim):
221
- name = "sim_lv"
222
- is_binary = True
223
- input_type = "string"
224
- output_type = "float"
225
- is_symmetrical = True
226
- has_symmetry_importance = True
221
+ name: str = "sim_lv"
222
+ is_binary: bool = True
223
+ input_type: Optional[str] = "string"
224
+ output_type: Optional[str] = "float"
225
+ is_symmetrical: bool = True
226
+ has_symmetry_importance: bool = True
227
227
 
228
228
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
229
229
  return value
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union
4
4
  import numpy as np
5
5
  import pandas as pd
6
6
  from pandas.core.arrays.timedeltas import TimedeltaArray
7
- from pydantic import BaseModel, validator
7
+ from pydantic import BaseModel, field_validator
8
8
 
9
9
  from upgini.autofe.operand import PandasOperand
10
10
 
@@ -38,10 +38,10 @@ class DateDiffMixin(BaseModel):
38
38
 
39
39
 
40
40
  class DateDiff(PandasOperand, DateDiffMixin):
41
- name = "date_diff"
42
- alias = "date_diff_type1"
43
- is_binary = True
44
- has_symmetry_importance = True
41
+ name: str = "date_diff"
42
+ alias: Optional[str] = "date_diff_type1"
43
+ is_binary: bool = True
44
+ has_symmetry_importance: bool = True
45
45
 
46
46
  replace_negative: bool = False
47
47
 
@@ -70,9 +70,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
70
70
 
71
71
 
72
72
  class DateDiffType2(PandasOperand, DateDiffMixin):
73
- name = "date_diff_type2"
74
- is_binary = True
75
- has_symmetry_importance = True
73
+ name: str = "date_diff_type2"
74
+ is_binary: bool = True
75
+ has_symmetry_importance: bool = True
76
76
 
77
77
  def get_params(self) -> Dict[str, Optional[str]]:
78
78
  res = super().get_params()
@@ -104,8 +104,8 @@ _count_aggregations = ["nunique", "count"]
104
104
 
105
105
 
106
106
  class DateListDiff(PandasOperand, DateDiffMixin):
107
- is_binary = True
108
- has_symmetry_importance = True
107
+ is_binary: bool = True
108
+ has_symmetry_importance: bool = True
109
109
 
110
110
  aggregation: str
111
111
  replace_negative: bool = False
@@ -165,8 +165,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
165
165
 
166
166
 
167
167
  class DateListDiffBounded(DateListDiff):
168
- lower_bound: Optional[int]
169
- upper_bound: Optional[int]
168
+ lower_bound: Optional[int] = None
169
+ upper_bound: Optional[int] = None
170
170
 
171
171
  def __init__(self, **data: Any) -> None:
172
172
  if "name" not in data:
@@ -191,8 +191,8 @@ class DateListDiffBounded(DateListDiff):
191
191
 
192
192
 
193
193
  class DatePercentileBase(PandasOperand, abc.ABC):
194
- is_binary = True
195
- output_type = "float"
194
+ is_binary: bool = True
195
+ output_type: Optional[str] = "float"
196
196
 
197
197
  date_unit: Optional[str] = None
198
198
 
@@ -226,12 +226,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
226
226
 
227
227
 
228
228
  class DatePercentile(DatePercentileBase):
229
- name = "date_per"
230
- alias = "date_per_method1"
229
+ name: str = "date_per"
230
+ alias: Optional[str] = "date_per_method1"
231
231
 
232
- zero_month: Optional[int]
233
- zero_year: Optional[int]
234
- zero_bounds: Optional[List[float]]
232
+ zero_month: Optional[int] = None
233
+ zero_year: Optional[int] = None
234
+ zero_bounds: Optional[List[float]] = None
235
235
  step: int = 30
236
236
 
237
237
  def get_params(self) -> Dict[str, Optional[str]]:
@@ -246,7 +246,7 @@ class DatePercentile(DatePercentileBase):
246
246
  )
247
247
  return res
248
248
 
249
- @validator("zero_bounds", pre=True)
249
+ @field_validator("zero_bounds", mode="before")
250
250
  def validate_bounds(cls, value):
251
251
  if value is None or isinstance(value, list):
252
252
  return value
@@ -264,7 +264,7 @@ class DatePercentile(DatePercentileBase):
264
264
 
265
265
 
266
266
  class DatePercentileMethod2(DatePercentileBase):
267
- name = "date_per_method2"
267
+ name: str = "date_per_method2"
268
268
 
269
269
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
270
270
  pass
@@ -82,9 +82,9 @@ class Feature:
82
82
  self.alias = alias
83
83
 
84
84
  def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
85
- obj_dict = self.op.dict().copy()
85
+ obj_dict = self.op.model_dump().copy()
86
86
  obj_dict.update(params or {})
87
- self.op = self.op.__class__.parse_obj(obj_dict)
87
+ self.op = self.op.__class__.model_validate(obj_dict)
88
88
  self.op.set_params(params)
89
89
 
90
90
  for child in self.children:
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
7
 
8
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
9
9
  agg: Optional[str]
10
- is_vectorizable = True
11
- is_grouping = True
12
- is_distribution_dependent = True
10
+ is_vectorizable: bool = True
11
+ is_grouping: bool = True
12
+ is_distribution_dependent: bool = True
13
13
 
14
14
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
15
  temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
24
24
 
25
25
 
26
26
  class GroupByThenMedian(GroupByThenAgg):
27
- name = "GroupByThenMedian"
28
- pandas_agg = "median"
29
- is_distribution_dependent = True
27
+ name: str = "GroupByThenMedian"
28
+ pandas_agg: str = "median"
29
+ is_distribution_dependent: bool = True
30
30
 
31
31
 
32
32
  class GroupByThenRank(PandasOperand, VectorizableMixin):
33
- name = "GroupByThenRank"
34
- is_vectorizable = True
35
- is_grouping = True
36
- output_type = "float"
37
- is_distribution_dependent = True
33
+ name: str = "GroupByThenRank"
34
+ is_vectorizable: bool = True
35
+ is_grouping: bool = True
36
+ output_type: Optional[str] = "float"
37
+ is_distribution_dependent: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
49
49
 
50
50
 
51
51
  class GroupByThenNUnique(PandasOperand, VectorizableMixin):
52
- name = "GroupByThenNUnique"
53
- is_vectorizable = True
54
- is_grouping = True
55
- output_type = "int"
56
- is_distribution_dependent = True
57
- input_type = "discrete"
52
+ name: str = "GroupByThenNUnique"
53
+ is_vectorizable: bool = True
54
+ is_grouping: bool = True
55
+ output_type: Optional[str] = "int"
56
+ is_distribution_dependent: bool = True
57
+ input_type: Optional[str] = "discrete"
58
58
 
59
59
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
60
60
  nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
69
69
 
70
70
 
71
71
  class GroupByThenFreq(PandasOperand):
72
- name = "GroupByThenFreq"
73
- is_grouping = True
74
- output_type = "float"
75
- is_distribution_dependent = True
76
- input_type = "discrete"
72
+ name: str = "GroupByThenFreq"
73
+ is_grouping: bool = True
74
+ output_type: Optional[str] = "float"
75
+ is_distribution_dependent: bool = True
76
+ input_type: Optional[str] = "discrete"
77
77
 
78
78
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
79
79
  def _f(x):
@@ -8,19 +8,19 @@ from pydantic import BaseModel
8
8
 
9
9
  class Operand(BaseModel):
10
10
  name: str
11
- alias: Optional[str]
11
+ alias: Optional[str] = None
12
12
  is_unary: bool = False
13
13
  is_symmetrical: bool = False
14
14
  has_symmetry_importance: bool = False
15
- input_type: Optional[str]
16
- output_type: Optional[str]
15
+ input_type: Optional[str] = None
16
+ output_type: Optional[str] = None
17
17
  is_categorical: bool = False
18
18
  is_vectorizable: bool = False
19
19
  is_grouping: bool = False
20
20
  is_binary: bool = False
21
21
  is_vector: bool = False
22
22
  is_distribution_dependent: bool = False
23
- params: Optional[Dict[str, str]]
23
+ params: Optional[Dict[str, str]] = None
24
24
 
25
25
  def set_params(self, params: Dict[str, str]):
26
26
  self.params = params
@@ -1,3 +1,4 @@
1
+ from typing import Optional
1
2
  import numpy as np
2
3
  import pandas as pd
3
4
  from sklearn.preprocessing import Normalizer
@@ -6,10 +7,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
7
 
7
8
 
8
9
  class Abs(PandasOperand, VectorizableMixin):
9
- name = "abs"
10
- is_unary = True
11
- is_vectorizable = True
12
- group_index = 0
10
+ name: str = "abs"
11
+ is_unary: bool = True
12
+ is_vectorizable: bool = True
13
+ group_index: int = 0
13
14
 
14
15
  def calculate_unary(self, data: pd.Series) -> pd.Series:
15
16
  return data.abs()
@@ -19,11 +20,11 @@ class Abs(PandasOperand, VectorizableMixin):
19
20
 
20
21
 
21
22
  class Log(PandasOperand, VectorizableMixin):
22
- name = "log"
23
- is_unary = True
24
- is_vectorizable = True
25
- output_type = "float"
26
- group_index = 0
23
+ name: str = "log"
24
+ is_unary: bool = True
25
+ is_vectorizable: bool = True
26
+ output_type: Optional[str] = "float"
27
+ group_index: int = 0
27
28
 
28
29
  def calculate_unary(self, data: pd.Series) -> pd.Series:
29
30
  return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -33,11 +34,11 @@ class Log(PandasOperand, VectorizableMixin):
33
34
 
34
35
 
35
36
  class Sqrt(PandasOperand, VectorizableMixin):
36
- name = "sqrt"
37
- is_unary = True
38
- is_vectorizable = True
39
- output_type = "float"
40
- group_index = 0
37
+ name: str = "sqrt"
38
+ is_unary: bool = True
39
+ is_vectorizable: bool = True
40
+ output_type: Optional[str] = "float"
41
+ group_index: int = 0
41
42
 
42
43
  def calculate_unary(self, data: pd.Series) -> pd.Series:
43
44
  return self._round_value(np.sqrt(np.abs(data)))
@@ -47,10 +48,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
47
48
 
48
49
 
49
50
  class Square(PandasOperand, VectorizableMixin):
50
- name = "square"
51
- is_unary = True
52
- is_vectorizable = True
53
- group_index = 0
51
+ name: str = "square"
52
+ is_unary: bool = True
53
+ is_vectorizable: bool = True
54
+ group_index: int = 0
54
55
 
55
56
  def calculate_unary(self, data: pd.Series) -> pd.Series:
56
57
  return np.square(data)
@@ -60,11 +61,11 @@ class Square(PandasOperand, VectorizableMixin):
60
61
 
61
62
 
62
63
  class Sigmoid(PandasOperand, VectorizableMixin):
63
- name = "sigmoid"
64
- is_unary = True
65
- is_vectorizable = True
66
- output_type = "float"
67
- group_index = 0
64
+ name: str = "sigmoid"
65
+ is_unary: bool = True
66
+ is_vectorizable: bool = True
67
+ output_type: Optional[str] = "float"
68
+ group_index: int = 0
68
69
 
69
70
  def calculate_unary(self, data: pd.Series) -> pd.Series:
70
71
  return self._round_value(1 / (1 + np.exp(-data)))
@@ -74,12 +75,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
74
75
 
75
76
 
76
77
  class Floor(PandasOperand, VectorizableMixin):
77
- name = "floor"
78
- is_unary = True
79
- is_vectorizable = True
80
- output_type = "int"
81
- input_type = "continuous"
82
- group_index = 0
78
+ name: str = "floor"
79
+ is_unary: bool = True
80
+ is_vectorizable: bool = True
81
+ output_type: Optional[str] = "int"
82
+ input_type: Optional[str] = "continuous"
83
+ group_index: int = 0
83
84
 
84
85
  def calculate_unary(self, data: pd.Series) -> pd.Series:
85
86
  return np.floor(data)
@@ -89,11 +90,11 @@ class Floor(PandasOperand, VectorizableMixin):
89
90
 
90
91
 
91
92
  class Residual(PandasOperand, VectorizableMixin):
92
- name = "residual"
93
- is_unary = True
94
- is_vectorizable = True
95
- input_type = "continuous"
96
- group_index = 0
93
+ name: str = "residual"
94
+ is_unary: bool = True
95
+ is_vectorizable: bool = True
96
+ input_type: Optional[str] = "continuous"
97
+ group_index: int = 0
97
98
 
98
99
  def calculate_unary(self, data: pd.Series) -> pd.Series:
99
100
  return data - np.floor(data)
@@ -103,11 +104,11 @@ class Residual(PandasOperand, VectorizableMixin):
103
104
 
104
105
 
105
106
  class Freq(PandasOperand):
106
- name = "freq"
107
- is_unary = True
108
- output_type = "float"
109
- is_distribution_dependent = True
110
- input_type = "discrete"
107
+ name: str = "freq"
108
+ is_unary: bool = True
109
+ output_type: Optional[str] = "float"
110
+ is_distribution_dependent: bool = True
111
+ input_type: Optional[str] = "discrete"
111
112
 
112
113
  def calculate_unary(self, data: pd.Series) -> pd.Series:
113
114
  value_counts = data.value_counts(normalize=True)
@@ -115,9 +116,9 @@ class Freq(PandasOperand):
115
116
 
116
117
 
117
118
  class Norm(PandasOperand):
118
- name = "norm"
119
- is_unary = True
120
- output_type = "float"
119
+ name: str = "norm"
120
+ is_unary: bool = True
121
+ output_type: Optional[str] = "float"
121
122
 
122
123
  def calculate_unary(self, data: pd.Series) -> pd.Series:
123
124
  data_dropna = data.dropna()
@@ -131,7 +132,7 @@ class Norm(PandasOperand):
131
132
 
132
133
 
133
134
  class Embeddings(PandasOperand):
134
- name = "emb"
135
- is_unary = True
136
- input_type = "string"
137
- output_type = "vector"
135
+ name: str = "emb"
136
+ is_unary: bool = True
137
+ input_type: Optional[str] = "string"
138
+ output_type: Optional[str] = "vector"
@@ -1,4 +1,4 @@
1
- from typing import List
1
+ from typing import List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Mean(PandasOperand, VectorizableMixin):
9
- name = "mean"
10
- output_type = "float"
11
- is_vector = True
12
- group_index = 0
9
+ name: str = "mean"
10
+ output_type: Optional[str] = "float"
11
+ is_vector: bool = True
12
+ group_index: int = 0
13
13
 
14
14
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
15
15
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
16
16
 
17
17
 
18
18
  class Sum(PandasOperand, VectorizableMixin):
19
- name = "sum"
20
- is_vector = True
21
- group_index = 0
19
+ name: str = "sum"
20
+ is_vector: bool = True
21
+ group_index: int = 0
22
22
 
23
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)