upgini-1.1.317.tar.gz → upgini-1.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

Files changed (65)
  1. {upgini-1.1.317 → upgini-1.2.1}/PKG-INFO +3 -3
  2. {upgini-1.1.317 → upgini-1.2.1}/pyproject.toml +7 -5
  3. upgini-1.2.1/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/binary.py +71 -71
  5. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/date.py +43 -25
  6. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/groupby.py +22 -22
  7. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/operand.py +4 -4
  8. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/unary.py +65 -50
  9. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/vector.py +8 -8
  10. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/dataset.py +8 -3
  11. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/features_enricher.py +6 -4
  12. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/http.py +15 -15
  13. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/lazy_import.py +14 -1
  14. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/metadata.py +57 -57
  15. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/normalizer/normalize_utils.py +1 -2
  16. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/search_task.py +10 -4
  17. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/datetime_utils.py +5 -5
  18. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/phone_utils.py +5 -7
  19. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/postal_code_utils.py +1 -1
  20. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/target_utils.py +4 -1
  21. upgini-1.1.317/src/upgini/__about__.py +0 -1
  22. {upgini-1.1.317 → upgini-1.2.1}/.gitignore +0 -0
  23. {upgini-1.1.317 → upgini-1.2.1}/LICENSE +0 -0
  24. {upgini-1.1.317 → upgini-1.2.1}/README.md +0 -0
  25. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/__init__.py +0 -0
  26. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/ads.py +0 -0
  27. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/ads_management/__init__.py +0 -0
  28. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/ads_management/ads_manager.py +0 -0
  29. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/__init__.py +0 -0
  30. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/all_operands.py +0 -0
  31. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/autofe/feature.py +0 -0
  32. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/errors.py +0 -0
  35. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/metrics.py +0 -0
  38. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/normalizer/__init__.py +0 -0
  39. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/resource_bundle/__init__.py +0 -0
  40. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/resource_bundle/exceptions.py +0 -0
  41. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/resource_bundle/strings.properties +0 -0
  42. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/spinner.py +0 -0
  48. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/ip_utils.py +0 -0
  61. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.317 → upgini-1.2.1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.317
3
+ Version: 1.2.1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
29
29
  Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: lightgbm>=3.3.2
32
- Requires-Dist: numpy>=1.19.0
32
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
33
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
34
- Requires-Dist: pydantic<2.0.0,>=1.8.2
34
+ Requires-Dist: pydantic<3.0.0,>1.0.0
35
35
  Requires-Dist: pyjwt>=2.8.0
36
36
  Requires-Dist: python-bidi==0.4.2
37
37
  Requires-Dist: python-dateutil>=2.8.0
@@ -39,9 +39,9 @@ dependencies = [
39
39
  "fastparquet>=0.8.1",
40
40
  "ipywidgets>=8.1.0",
41
41
  "lightgbm>=3.3.2",
42
- "numpy>=1.19.0",
42
+ "numpy>=1.19.0,<=1.26.4",
43
43
  "pandas>=1.1.0,<3.0.0",
44
- "pydantic>=1.8.2,<2.0.0",
44
+ "pydantic>1.0.0,<3.0.0",
45
45
  "pyjwt>=2.8.0",
46
46
  "python-dateutil>=2.8.0",
47
47
  "python-json-logger>=2.0.2",
@@ -79,7 +79,7 @@ python = "3.10"
79
79
  cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
80
80
  format = "black {args}"
81
81
  lint = "ruff check {args}"
82
- test_binary = 'pytest -s -vv tests/test_binary_dataset.py'
82
+ test_all = 'pytest -s -vv tests'
83
83
 
84
84
  [[tool.hatch.envs.test.matrix]]
85
85
  python = ["3.8"]
@@ -103,7 +103,8 @@ dependencies = [
103
103
  # "pytest-timeout",
104
104
  "requests-mock",
105
105
  "pytest-datafiles",
106
- "pandas~={matrix:pandas}.0",
106
+ "pytest-xdist",
107
+ "pandas~={matrix:pandas}",
107
108
  ]
108
109
 
109
110
  [tool.black]
@@ -115,4 +116,5 @@ profile = "black"
115
116
  [tool.pytest.ini_options]
116
117
  pythonpath = [
117
118
  "./src"
118
- ]
119
+ ]
120
+ addopts="-n 4"
@@ -0,0 +1 @@
1
+ __version__ = "1.2.1"
@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
12
- name = "min"
13
- is_binary = True
14
- is_symmetrical = True
15
- has_symmetry_importance = True
12
+ name: str = "min"
13
+ is_binary: bool = True
14
+ is_symmetrical: bool = True
15
+ has_symmetry_importance: bool = True
16
16
 
17
17
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
21
  class Max(PandasOperand):
22
- name = "max"
23
- is_binary = True
24
- is_symmetrical = True
25
- has_symmetry_importance = True
22
+ name: str = "max"
23
+ is_binary: bool = True
24
+ is_symmetrical: bool = True
25
+ has_symmetry_importance: bool = True
26
26
 
27
27
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
31
  class Add(PandasOperand, VectorizableMixin):
32
- name = "+"
33
- alias = "add"
34
- is_binary = True
35
- is_symmetrical = True
36
- has_symmetry_importance = True
37
- is_vectorizable = True
32
+ name: str = "+"
33
+ alias: str = "add"
34
+ is_binary: bool = True
35
+ is_symmetrical: bool = True
36
+ has_symmetry_importance: bool = True
37
+ is_vectorizable: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
48
48
 
49
49
 
50
50
  class Subtract(PandasOperand, VectorizableMixin):
51
- name = "-"
52
- alias = "sub"
53
- is_binary = True
54
- is_symmetrical = True
55
- has_symmetry_importance = True
56
- is_vectorizable = True
51
+ name: str = "-"
52
+ alias: str = "sub"
53
+ is_binary: bool = True
54
+ is_symmetrical: bool = True
55
+ has_symmetry_importance: bool = True
56
+ is_vectorizable: bool = True
57
57
 
58
58
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
59
59
  return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
67
67
 
68
68
 
69
69
  class Multiply(PandasOperand, VectorizableMixin):
70
- name = "*"
71
- alias = "mul"
72
- is_binary = True
73
- is_symmetrical = True
74
- has_symmetry_importance = True
75
- is_vectorizable = True
70
+ name: str = "*"
71
+ alias: str = "mul"
72
+ is_binary: bool = True
73
+ is_symmetrical: bool = True
74
+ has_symmetry_importance: bool = True
75
+ is_vectorizable: bool = True
76
76
 
77
77
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
78
78
  return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
86
86
 
87
87
 
88
88
  class Divide(PandasOperand, VectorizableMixin):
89
- name = "/"
90
- alias = "div"
91
- is_binary = True
92
- has_symmetry_importance = True
93
- is_vectorizable = True
94
- output_type = "float"
89
+ name: str = "/"
90
+ alias: str = "div"
91
+ is_binary: bool = True
92
+ has_symmetry_importance: bool = True
93
+ is_vectorizable: bool = True
94
+ output_type: Optional[str] = "float"
95
95
 
96
96
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
97
97
  return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
105
105
 
106
106
 
107
107
  class Combine(PandasOperand):
108
- name = "Combine"
109
- is_binary = True
110
- has_symmetry_importance = True
111
- output_type = "object"
108
+ name: str = "Combine"
109
+ is_binary: bool = True
110
+ has_symmetry_importance: bool = True
111
+ output_type: Optional[str] = "object"
112
112
 
113
113
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
114
114
  temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
117
117
 
118
118
 
119
119
  class CombineThenFreq(PandasOperand):
120
- name = "CombineThenFreq"
121
- is_binary = True
122
- is_symmetrical = True
123
- has_symmetry_importance = True
124
- output_type = "float"
125
- is_distribution_dependent = True
126
- input_type = "discrete"
120
+ name: str = "CombineThenFreq"
121
+ is_binary: bool = True
122
+ is_symmetrical: bool = True
123
+ has_symmetry_importance: bool = True
124
+ output_type: Optional[str] = "float"
125
+ is_distribution_dependent: bool = True
126
+ input_type: Optional[str] = "discrete"
127
127
 
128
128
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
129
129
  temp = left.astype(str) + "_" + right.astype(str)
@@ -133,11 +133,11 @@ class CombineThenFreq(PandasOperand):
133
133
 
134
134
 
135
135
  class Distance(PandasOperand):
136
- name = "dist"
137
- is_binary = True
138
- output_type = "float"
139
- is_symmetrical = True
140
- has_symmetry_importance = True
136
+ name: str = "dist"
137
+ is_binary: bool = True
138
+ output_type: Optional[str] = "float"
139
+ is_symmetrical: bool = True
140
+ has_symmetry_importance: bool = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
@@ -158,11 +158,11 @@ class Distance(PandasOperand):
158
158
 
159
159
  # Left for backward compatibility
160
160
  class Sim(Distance):
161
- name = "sim"
162
- is_binary = True
163
- output_type = "float"
164
- is_symmetrical = True
165
- has_symmetry_importance = True
161
+ name: str = "sim"
162
+ is_binary: bool = True
163
+ output_type: Optional[str] = "float"
164
+ is_symmetrical: bool = True
165
+ has_symmetry_importance: bool = True
166
166
 
167
167
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
168
  return 1 - super().calculate_binary(left, right)
@@ -191,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
191
191
 
192
192
 
193
193
  class JaroWinklerSim1(StringSim):
194
- name = "sim_jw1"
195
- is_binary = True
196
- input_type = "string"
197
- output_type = "float"
198
- is_symmetrical = True
199
- has_symmetry_importance = True
194
+ name: str = "sim_jw1"
195
+ is_binary: bool = True
196
+ input_type: Optional[str] = "string"
197
+ output_type: Optional[str] = "float"
198
+ is_symmetrical: bool = True
199
+ has_symmetry_importance: bool = True
200
200
 
201
201
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
202
202
  return value
@@ -206,12 +206,12 @@ class JaroWinklerSim1(StringSim):
206
206
 
207
207
 
208
208
  class JaroWinklerSim2(StringSim):
209
- name = "sim_jw2"
210
- is_binary = True
211
- input_type = "string"
212
- output_type = "float"
213
- is_symmetrical = True
214
- has_symmetry_importance = True
209
+ name: str = "sim_jw2"
210
+ is_binary: bool = True
211
+ input_type: Optional[str] = "string"
212
+ output_type: Optional[str] = "float"
213
+ is_symmetrical: bool = True
214
+ has_symmetry_importance: bool = True
215
215
 
216
216
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
217
217
  return value[::-1] if value is not None else None
@@ -221,12 +221,12 @@ class JaroWinklerSim2(StringSim):
221
221
 
222
222
 
223
223
  class LevenshteinSim(StringSim):
224
- name = "sim_lv"
225
- is_binary = True
226
- input_type = "string"
227
- output_type = "float"
228
- is_symmetrical = True
229
- has_symmetry_importance = True
224
+ name: str = "sim_lv"
225
+ is_binary: bool = True
226
+ input_type: Optional[str] = "string"
227
+ output_type: Optional[str] = "float"
228
+ is_symmetrical: bool = True
229
+ has_symmetry_importance: bool = True
230
230
 
231
231
  def _prepare_value(self, value: Optional[str]) -> Optional[str]:
232
232
  return value
@@ -5,11 +5,16 @@ from typing import Any, Dict, List, Optional, Union
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
- from pydantic import BaseModel, validator
8
+ from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
10
  from upgini.autofe.operand import PandasOperand
11
11
 
12
12
 
13
+ def get_pydantic_version():
14
+ major_version = int(pydantic_version.split('.')[0])
15
+ return major_version
16
+
17
+
13
18
  class DateDiffMixin(BaseModel):
14
19
  diff_unit: str = "D"
15
20
  left_unit: Optional[str] = None
@@ -39,10 +44,10 @@ class DateDiffMixin(BaseModel):
39
44
 
40
45
 
41
46
  class DateDiff(PandasOperand, DateDiffMixin):
42
- name = "date_diff"
43
- alias = "date_diff_type1"
44
- is_binary = True
45
- has_symmetry_importance = True
47
+ name: str = "date_diff"
48
+ alias: Optional[str] = "date_diff_type1"
49
+ is_binary: bool = True
50
+ has_symmetry_importance: bool = True
46
51
 
47
52
  replace_negative: bool = False
48
53
 
@@ -71,9 +76,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
71
76
 
72
77
 
73
78
  class DateDiffType2(PandasOperand, DateDiffMixin):
74
- name = "date_diff_type2"
75
- is_binary = True
76
- has_symmetry_importance = True
79
+ name: str = "date_diff_type2"
80
+ is_binary: bool = True
81
+ has_symmetry_importance: bool = True
77
82
 
78
83
  def get_params(self) -> Dict[str, Optional[str]]:
79
84
  res = super().get_params()
@@ -105,8 +110,8 @@ _count_aggregations = ["nunique", "count"]
105
110
 
106
111
 
107
112
  class DateListDiff(PandasOperand, DateDiffMixin):
108
- is_binary = True
109
- has_symmetry_importance = True
113
+ is_binary: bool = True
114
+ has_symmetry_importance: bool = True
110
115
 
111
116
  aggregation: str
112
117
  replace_negative: bool = False
@@ -166,8 +171,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
166
171
 
167
172
 
168
173
  class DateListDiffBounded(DateListDiff):
169
- lower_bound: Optional[int]
170
- upper_bound: Optional[int]
174
+ lower_bound: Optional[int] = None
175
+ upper_bound: Optional[int] = None
171
176
 
172
177
  def __init__(self, **data: Any) -> None:
173
178
  if "name" not in data:
@@ -192,8 +197,8 @@ class DateListDiffBounded(DateListDiff):
192
197
 
193
198
 
194
199
  class DatePercentileBase(PandasOperand, abc.ABC):
195
- is_binary = True
196
- output_type = "float"
200
+ is_binary: bool = True
201
+ output_type: Optional[str] = "float"
197
202
 
198
203
  date_unit: Optional[str] = None
199
204
 
@@ -227,12 +232,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
227
232
 
228
233
 
229
234
  class DatePercentile(DatePercentileBase):
230
- name = "date_per"
231
- alias = "date_per_method1"
235
+ name: str = "date_per"
236
+ alias: Optional[str] = "date_per_method1"
232
237
 
233
- zero_month: Optional[int]
234
- zero_year: Optional[int]
235
- zero_bounds: Optional[List[float]]
238
+ zero_month: Optional[int] = None
239
+ zero_year: Optional[int] = None
240
+ zero_bounds: Optional[List[float]] = None
236
241
  step: int = 30
237
242
 
238
243
  def get_params(self) -> Dict[str, Optional[str]]:
@@ -247,12 +252,25 @@ class DatePercentile(DatePercentileBase):
247
252
  )
248
253
  return res
249
254
 
250
- @validator("zero_bounds", pre=True)
251
- def validate_bounds(cls, value):
252
- if value is None or isinstance(value, list):
255
+ # Check Pydantic version
256
+ if get_pydantic_version() >= 2:
257
+ # Use @field_validator for Pydantic 2.x
258
+ from pydantic import field_validator
259
+
260
+ @field_validator('zero_bounds', mode='before')
261
+ def parse_zero_bounds(cls, value):
262
+ if isinstance(value, str):
263
+ return json.loads(value)
264
+ return value
265
+ else:
266
+ # Use @validator for Pydantic 1.x
267
+ from pydantic import validator
268
+
269
+ @validator('zero_bounds', pre=True)
270
+ def parse_zero_bounds(cls, value):
271
+ if isinstance(value, str):
272
+ return json.loads(value)
253
273
  return value
254
- elif isinstance(value, str):
255
- return json.loads(value)
256
274
 
257
275
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
258
276
  months = date_col.dt.month
@@ -265,7 +283,7 @@ class DatePercentile(DatePercentileBase):
265
283
 
266
284
 
267
285
  class DatePercentileMethod2(DatePercentileBase):
268
- name = "date_per_method2"
286
+ name: str = "date_per_method2"
269
287
 
270
288
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
271
289
  pass
@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
7
 
8
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
9
9
  agg: Optional[str]
10
- is_vectorizable = True
11
- is_grouping = True
12
- is_distribution_dependent = True
10
+ is_vectorizable: bool = True
11
+ is_grouping: bool = True
12
+ is_distribution_dependent: bool = True
13
13
 
14
14
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
15
  temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
24
24
 
25
25
 
26
26
  class GroupByThenMedian(GroupByThenAgg):
27
- name = "GroupByThenMedian"
28
- pandas_agg = "median"
29
- is_distribution_dependent = True
27
+ name: str = "GroupByThenMedian"
28
+ pandas_agg: str = "median"
29
+ is_distribution_dependent: bool = True
30
30
 
31
31
 
32
32
  class GroupByThenRank(PandasOperand, VectorizableMixin):
33
- name = "GroupByThenRank"
34
- is_vectorizable = True
35
- is_grouping = True
36
- output_type = "float"
37
- is_distribution_dependent = True
33
+ name: str = "GroupByThenRank"
34
+ is_vectorizable: bool = True
35
+ is_grouping: bool = True
36
+ output_type: Optional[str] = "float"
37
+ is_distribution_dependent: bool = True
38
38
 
39
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
40
40
  temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
49
49
 
50
50
 
51
51
  class GroupByThenNUnique(PandasOperand, VectorizableMixin):
52
- name = "GroupByThenNUnique"
53
- is_vectorizable = True
54
- is_grouping = True
55
- output_type = "int"
56
- is_distribution_dependent = True
57
- input_type = "discrete"
52
+ name: str = "GroupByThenNUnique"
53
+ is_vectorizable: bool = True
54
+ is_grouping: bool = True
55
+ output_type: Optional[str] = "int"
56
+ is_distribution_dependent: bool = True
57
+ input_type: Optional[str] = "discrete"
58
58
 
59
59
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
60
60
  nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
69
69
 
70
70
 
71
71
  class GroupByThenFreq(PandasOperand):
72
- name = "GroupByThenFreq"
73
- is_grouping = True
74
- output_type = "float"
75
- is_distribution_dependent = True
76
- input_type = "discrete"
72
+ name: str = "GroupByThenFreq"
73
+ is_grouping: bool = True
74
+ output_type: Optional[str] = "float"
75
+ is_distribution_dependent: bool = True
76
+ input_type: Optional[str] = "discrete"
77
77
 
78
78
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
79
79
  def _f(x):
@@ -8,19 +8,19 @@ from pydantic import BaseModel
8
8
 
9
9
  class Operand(BaseModel):
10
10
  name: str
11
- alias: Optional[str]
11
+ alias: Optional[str] = None
12
12
  is_unary: bool = False
13
13
  is_symmetrical: bool = False
14
14
  has_symmetry_importance: bool = False
15
- input_type: Optional[str]
16
- output_type: Optional[str]
15
+ input_type: Optional[str] = None
16
+ output_type: Optional[str] = None
17
17
  is_categorical: bool = False
18
18
  is_vectorizable: bool = False
19
19
  is_grouping: bool = False
20
20
  is_binary: bool = False
21
21
  is_vector: bool = False
22
22
  is_distribution_dependent: bool = False
23
- params: Optional[Dict[str, str]]
23
+ params: Optional[Dict[str, str]] = None
24
24
 
25
25
  def set_params(self, params: Dict[str, str]):
26
26
  self.params = params