upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (43) hide show
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
  39. upgini-1.2.31a1.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.280.dev0"
1
+ __version__ = "1.2.31a1"
upgini/__init__.py CHANGED
@@ -1,21 +1,5 @@
1
- from typing import List
1
+ from upgini.features_enricher import FeaturesEnricher # noqa: F401
2
+ from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
3
+ import warnings
2
4
 
3
- from .dataset import Dataset
4
- from .features_enricher import FeaturesEnricher # noqa: F401
5
- from .metadata import ( # noqa: F401
6
- FileColumnMeaningType,
7
- FileMetrics,
8
- ModelTaskType,
9
- SearchKey,
10
- )
11
- from .search_task import SearchTask
12
-
13
-
14
- def search_history() -> List[SearchTask]:
15
- # TODO
16
- return []
17
-
18
-
19
- def datasets_history() -> List[Dataset]:
20
- # TODO
21
- return []
5
+ warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")
upgini/autofe/all_operands.py CHANGED
@@ -1,10 +1,32 @@
1
+ from copy import deepcopy
1
2
  from typing import Dict
2
3
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
- from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
5
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
4
+ from upgini.autofe.binary import (
5
+ Add,
6
+ Combine,
7
+ CombineThenFreq,
8
+ Distance,
9
+ Divide,
10
+ JaroWinklerSim1,
11
+ JaroWinklerSim2,
12
+ LevenshteinSim,
13
+ Max,
14
+ Min,
15
+ Multiply,
16
+ Sim,
17
+ Subtract,
18
+ )
19
+ from upgini.autofe.date import (
20
+ DateDiff,
21
+ DateDiffType2,
22
+ DateListDiff,
23
+ DateListDiffBounded,
24
+ DatePercentile,
25
+ DatePercentileMethod2,
26
+ )
27
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
6
28
  from upgini.autofe.operand import Operand
7
- from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
29
+ from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
8
30
  from upgini.autofe.vector import Mean, Sum
9
31
 
10
32
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -32,10 +54,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
32
54
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
33
55
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
34
56
  GroupByThenRank(),
35
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
36
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
37
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
38
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
57
+ Combine(),
58
+ CombineThenFreq(),
59
+ GroupByThenNUnique(),
60
+ GroupByThenFreq(),
39
61
  Sim(),
40
62
  DateDiff(),
41
63
  DateDiffType2(),
@@ -49,9 +71,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
49
71
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
50
72
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
51
73
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
74
+ DatePercentile(),
75
+ DatePercentileMethod2(),
76
+ Norm(),
77
+ JaroWinklerSim1(),
78
+ JaroWinklerSim2(),
79
+ LevenshteinSim(),
80
+ Distance(),
81
+ Embeddings(),
52
82
  ]
53
83
  }
54
84
 
55
85
 
56
86
  def find_op(name):
57
- return ALL_OPERANDS.get(name)
87
+ return deepcopy(ALL_OPERANDS.get(name))
upgini/autofe/binary.py CHANGED
@@ -1,35 +1,40 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
- from numpy import dot
4
- from numpy.linalg import norm
6
+ from jarowinkler import jarowinkler_similarity
5
7
 
6
8
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
9
 
8
10
 
9
11
  class Min(PandasOperand):
10
- name = "min"
11
- is_binary = True
12
- has_symmetry_importance = True
12
+ name: str = "min"
13
+ is_binary: bool = True
14
+ is_symmetrical: bool = True
15
+ has_symmetry_importance: bool = True
13
16
 
14
17
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
18
  return np.minimum(left, right)
16
19
 
17
20
 
18
21
  class Max(PandasOperand):
19
- name = "max"
20
- is_binary = True
21
- has_symmetry_importance = True
22
+ name: str = "max"
23
+ is_binary: bool = True
24
+ is_symmetrical: bool = True
25
+ has_symmetry_importance: bool = True
22
26
 
23
27
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
24
28
  return np.maximum(left, right)
25
29
 
26
30
 
27
31
  class Add(PandasOperand, VectorizableMixin):
28
- name = "+"
29
- alias = "add"
30
- is_binary = True
31
- has_symmetry_importance = True
32
- is_vectorizable = True
32
+ name: str = "+"
33
+ alias: str = "add"
34
+ is_binary: bool = True
35
+ is_symmetrical: bool = True
36
+ has_symmetry_importance: bool = True
37
+ is_vectorizable: bool = True
33
38
 
34
39
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
35
40
  return left + right
@@ -43,11 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
43
48
 
44
49
 
45
50
  class Subtract(PandasOperand, VectorizableMixin):
46
- name = "-"
47
- alias = "sub"
48
- is_binary = True
49
- has_symmetry_importance = True
50
- is_vectorizable = True
51
+ name: str = "-"
52
+ alias: str = "sub"
53
+ is_binary: bool = True
54
+ is_symmetrical: bool = True
55
+ has_symmetry_importance: bool = True
56
+ is_vectorizable: bool = True
51
57
 
52
58
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
53
59
  return left - right
@@ -61,11 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
61
67
 
62
68
 
63
69
  class Multiply(PandasOperand, VectorizableMixin):
64
- name = "*"
65
- alias = "mul"
66
- is_binary = True
67
- has_symmetry_importance = True
68
- is_vectorizable = True
70
+ name: str = "*"
71
+ alias: str = "mul"
72
+ is_binary: bool = True
73
+ is_symmetrical: bool = True
74
+ has_symmetry_importance: bool = True
75
+ is_vectorizable: bool = True
69
76
 
70
77
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
71
78
  return left * right
@@ -79,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
79
86
 
80
87
 
81
88
  class Divide(PandasOperand, VectorizableMixin):
82
- name = "/"
83
- alias = "div"
84
- is_binary = True
85
- has_symmetry_importance = True
86
- is_vectorizable = True
87
- output_type = "float"
89
+ name: str = "/"
90
+ alias: str = "div"
91
+ is_binary: bool = True
92
+ has_symmetry_importance: bool = True
93
+ is_vectorizable: bool = True
94
+ output_type: Optional[str] = "float"
88
95
 
89
96
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
90
97
  return left / right.replace(0, np.nan)
@@ -98,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
98
105
 
99
106
 
100
107
  class Combine(PandasOperand):
101
- name = "Combine"
102
- is_binary = True
103
- has_symmetry_importance = True
104
- output_type = "object"
108
+ name: str = "Combine"
109
+ is_binary: bool = True
110
+ has_symmetry_importance: bool = True
111
+ output_type: Optional[str] = "object"
105
112
 
106
113
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
107
114
  temp = left.astype(str) + "_" + right.astype(str)
@@ -110,12 +117,13 @@ class Combine(PandasOperand):
110
117
 
111
118
 
112
119
  class CombineThenFreq(PandasOperand):
113
- name = "CombineThenFreq"
114
- is_binary = True
115
- has_symmetry_importance = True
116
- output_type = "float"
117
- is_distribution_dependent = True
118
- input_type = "discrete"
120
+ name: str = "CombineThenFreq"
121
+ is_binary: bool = True
122
+ is_symmetrical: bool = True
123
+ has_symmetry_importance: bool = True
124
+ output_type: Optional[str] = "float"
125
+ is_distribution_dependent: bool = True
126
+ input_type: Optional[str] = "discrete"
119
127
 
120
128
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
121
129
  temp = left.astype(str) + "_" + right.astype(str)
@@ -124,11 +132,106 @@ class CombineThenFreq(PandasOperand):
124
132
  self._loc(temp, value_counts)
125
133
 
126
134
 
127
- class Sim(PandasOperand):
128
- name = "sim"
129
- is_binary = True
130
- output_type = "float"
131
- has_symmetry_importance = True
135
class Distance(PandasOperand):
    """Row-wise cosine distance between two columns of vectors.

    Each cell of ``left`` and ``right`` is converted with ``np.array``; the
    result for a row is ``1 - dot(l, r) / (norm(l) * norm(r))``.
    """

    name: str = "dist"
    is_binary: bool = True
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return the cosine distance per row as a float64 Series indexed like ``left``."""
        dot_product = self.__dot(left, right)
        left_norm = self.__norm(left)
        right_norm = self.__norm(right)
        distance = 1 - dot_product / (left_norm * right_norm)
        return pd.Series(distance, index=left.index).astype(np.float64)

    # row-wise dot product, handling None values
    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
        left_vectors = left.apply(lambda x: np.array(x))
        right_vectors = right.apply(lambda x: np.array(x))
        # Multiply only the rows left after dropna(), then sum each product vector.
        products = left_vectors.dropna() * right_vectors.dropna()
        row_sums = products.apply(np.sum)
        # Re-expand to every label of both inputs; rows without a product become NaN.
        return row_sums.reindex(left.index.union(right.index))

    # Calculate the norm of a vector, handling None values
    def __norm(self, vector: pd.Series) -> pd.Series:
        filled = vector.fillna(np.nan)
        return np.sqrt(self.__dot(filled, filled))
159
+
160
+
161
+ # Left for backward compatibility
162
class Sim(Distance):
    """Cosine similarity, defined as ``1 - Distance`` for each row."""

    name: str = "sim"
    is_binary: bool = True
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return one minus the parent's distance for each row."""
        distance = super().calculate_binary(left, right)
        return 1 - distance
171
+
172
+
173
class StringSim(PandasOperand, abc.ABC):
    """Base class for operands that score the similarity of two string columns.

    Subclasses supply ``_prepare_value`` (per-value preprocessing) and
    ``_similarity`` (the score for one prepared pair). Rows where either side
    is missing yield ``None``.
    """

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return the per-row similarity, indexed like ``left``.

        Values are looked up by the labels of ``left``; a label absent from
        ``right`` is treated as missing.
        """
        sims = []
        for i in left.index:
            # pandas stores missing strings as NaN / pd.NA rather than None, so
            # normalise them first; otherwise they would reach the string
            # routines below and raise TypeError.
            left_i = self._prepare_value(self._none_if_missing(left.get(i)))
            right_i = self._prepare_value(self._none_if_missing(right.get(i)))
            if left_i is None or right_i is None:
                sims.append(None)
            else:
                sims.append(self._similarity(left_i, right_i))

        return pd.Series(sims, index=left.index)

    @staticmethod
    def _none_if_missing(value):
        """Map the pandas missing markers (None, NaN, pd.NA) to None."""
        if value is None or value is pd.NA:
            return None
        if isinstance(value, float) and np.isnan(value):
            return None
        return value

    @abc.abstractmethod
    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Preprocess one value; returning None marks the row as missing."""
        pass

    @abc.abstractmethod
    def _similarity(self, left: str, right: str) -> float:
        """Return the similarity score for one pair of prepared strings."""
        pass
193
+
194
+
195
class JaroWinklerSim1(StringSim):
    """Jaro-Winkler similarity of the two strings, used as given."""

    name: str = "sim_jw1"
    is_binary: bool = True
    input_type: Optional[str] = "string"
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Pass the value through unchanged."""
        return value

    def _similarity(self, left: str, right: str) -> float:
        """Return ``jarowinkler_similarity(left, right)``."""
        score = jarowinkler_similarity(left, right)
        return score
208
+
209
+
210
class JaroWinklerSim2(StringSim):
    """Jaro-Winkler similarity computed on the reversed strings.

    NOTE(review): reversing presumably shifts the metric's emphasis from
    shared prefixes to shared suffixes - confirm the intent.
    """

    name: str = "sim_jw2"
    is_binary: bool = True
    input_type: Optional[str] = "string"
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Return the value reversed, or None when it is None."""
        if value is None:
            return None
        return value[::-1]

    def _similarity(self, left: str, right: str) -> float:
        """Return ``jarowinkler_similarity`` of the (already reversed) pair."""
        score = jarowinkler_similarity(left, right)
        return score
223
+
224
+
225
class LevenshteinSim(StringSim):
    """Normalised Levenshtein similarity: ``1 - distance / max(len)``."""

    name: str = "sim_lv"
    is_binary: bool = True
    input_type: Optional[str] = "string"
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Pass the value through unchanged."""
        return value

    def _similarity(self, left: str, right: str) -> float:
        """Return 1 minus the edit distance divided by the longer length.

        Two empty strings are identical, so they score 1.0; without this
        guard the normalisation would divide by zero.
        """
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0
        return 1 - Levenshtein.distance(left, right) / longest
upgini/autofe/date.py CHANGED
@@ -1,13 +1,20 @@
1
- from typing import Any, Optional, Union
1
+ import abc
2
+ import json
3
+ from typing import Any, Dict, List, Optional, Union
2
4
 
3
5
  import numpy as np
4
6
  import pandas as pd
5
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
6
- from pydantic import BaseModel
8
+ from pydantic import BaseModel, __version__ as pydantic_version
7
9
 
8
10
  from upgini.autofe.operand import PandasOperand
9
11
 
10
12
 
13
def get_pydantic_version():
    """Return the major version of the installed pydantic as an int."""
    # Everything before the first dot of the version string is the major part.
    major_part, _, _ = pydantic_version.partition(".")
    return int(major_part)
16
+
17
+
11
18
  class DateDiffMixin(BaseModel):
12
19
  diff_unit: str = "D"
13
20
  left_unit: Optional[str] = None
@@ -19,34 +26,76 @@ class DateDiffMixin(BaseModel):
19
26
  if isinstance(x, pd.DataFrame):
20
27
  return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
21
28
 
22
- return pd.to_datetime(x, unit=unit)
29
+ return pd.to_datetime(x, unit=unit, errors="coerce")
30
+
31
+ def _convert_diff_to_unit(self, diff: Union[pd.Series, TimedeltaArray]) -> Union[pd.Series, TimedeltaArray]:
32
+ if self.diff_unit == "D":
33
+ if isinstance(diff, pd.Series) and diff.dtype == "object":
34
+ return diff.apply(lambda x: None if isinstance(x, float) and np.isnan(x) else x.days)
35
+ else:
36
+ return diff / np.timedelta64(1, self.diff_unit)
37
+ elif self.diff_unit == "Y":
38
+ if isinstance(diff, TimedeltaArray):
39
+ return (diff / 365 / 24 / 60 / 60 / 10**9).astype(int)
40
+ else:
41
+ return (diff / 365 / 24 / 60 / 60 / 10**9).dt.nanoseconds
42
+ else:
43
+ raise Exception(f"Unsupported difference unit: {self.diff_unit}")
23
44
 
24
45
 
25
46
  class DateDiff(PandasOperand, DateDiffMixin):
26
- name = "date_diff"
27
- is_binary = True
28
- has_symmetry_importance = True
47
+ name: str = "date_diff"
48
+ alias: Optional[str] = "date_diff_type1"
49
+ is_binary: bool = True
50
+ has_symmetry_importance: bool = True
51
+
52
+ replace_negative: bool = False
53
+
54
+ def get_params(self) -> Dict[str, Optional[str]]:
55
+ res = super().get_params()
56
+ res.update(
57
+ {
58
+ "diff_unit": self.diff_unit,
59
+ "left_unit": self.left_unit,
60
+ "right_unit": self.right_unit,
61
+ "replace_negative": self.replace_negative,
62
+ }
63
+ )
64
+ return res
29
65
 
30
66
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
31
67
  left = self._convert_to_date(left, self.left_unit)
32
68
  right = self._convert_to_date(right, self.right_unit)
33
- return self.__replace_negative((left - right) / np.timedelta64(1, self.diff_unit))
69
+ diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
70
+ return self.__replace_negative(diff)
34
71
 
35
72
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
36
- x[x < 0] = None
73
+ if self.replace_negative:
74
+ x[x < 0] = None
37
75
  return x
38
76
 
39
77
 
40
78
  class DateDiffType2(PandasOperand, DateDiffMixin):
41
- name = "date_diff_type2"
42
- is_binary = True
43
- has_symmetry_importance = True
79
+ name: str = "date_diff_type2"
80
+ is_binary: bool = True
81
+ has_symmetry_importance: bool = True
82
+
83
+ def get_params(self) -> Dict[str, Optional[str]]:
84
+ res = super().get_params()
85
+ res.update(
86
+ {
87
+ "diff_unit": self.diff_unit,
88
+ "left_unit": self.left_unit,
89
+ "right_unit": self.right_unit,
90
+ }
91
+ )
92
+ return res
44
93
 
45
94
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
46
95
  left = self._convert_to_date(left, self.left_unit)
47
96
  right = self._convert_to_date(right, self.right_unit)
48
97
  future = right + (left.dt.year - right.dt.year).apply(
49
- lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
98
+ lambda y: pd.tseries.offsets.DateOffset(years=0 if np.isnan(y) else y)
50
99
  )
51
100
  future = pd.to_datetime(future)
52
101
  before = future[future < left]
@@ -57,12 +106,28 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
57
106
 
58
107
 
59
108
  _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
109
+ _count_aggregations = ["nunique", "count"]
60
110
 
61
111
 
62
112
  class DateListDiff(PandasOperand, DateDiffMixin):
63
- is_binary = True
64
- has_symmetry_importance = True
113
+ is_binary: bool = True
114
+ has_symmetry_importance: bool = True
115
+
65
116
  aggregation: str
117
+ replace_negative: bool = False
118
+
119
+ def get_params(self) -> Dict[str, Optional[str]]:
120
+ res = super().get_params()
121
+ res.update(
122
+ {
123
+ "aggregation": self.aggregation,
124
+ "diff_unit": self.diff_unit,
125
+ "left_unit": self.left_unit,
126
+ "right_unit": self.right_unit,
127
+ "replace_negative": self.replace_negative,
128
+ }
129
+ )
130
+ return res
66
131
 
67
132
  def __init__(self, **data: Any) -> None:
68
133
  if "name" not in data:
@@ -71,18 +136,28 @@ class DateListDiff(PandasOperand, DateDiffMixin):
71
136
 
72
137
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
73
138
  left = self._convert_to_date(left, self.left_unit)
74
- right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
139
+ right_mask = right.apply(lambda x: len(x) > 0)
140
+ mask = left.notna() & right.notna() & right_mask
141
+ right_masked = right[mask].apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
142
+
143
+ if len(right_masked) == 0:
144
+ diff = []
145
+ elif len(right_masked) < 2:
146
+ diff = [left[mask].iloc[0] - right_masked.iloc[0]]
147
+ else:
148
+ diff = left[mask] - right_masked.values
75
149
 
76
- return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
150
+ res_masked = pd.Series(diff, index=left[mask].index).apply(lambda x: self._agg(self._diff(x)))
151
+ res = res_masked.reindex(left.index.union(right.index))
152
+ if self.aggregation in _count_aggregations:
153
+ res[~right_mask] = 0.0
154
+ res = res.astype(np.float64)
155
+
156
+ return res
77
157
 
78
158
  def _diff(self, x: TimedeltaArray):
79
- if self.diff_unit == "Y":
80
- x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
81
- elif self.diff_unit == "M":
82
- raise Exception("Unsupported difference unit: Month")
83
- else:
84
- x = x / np.timedelta64(1, self.diff_unit)
85
- return x[x > 0]
159
+ x = self._convert_diff_to_unit(x)
160
+ return x[x > 0] if self.replace_negative else x
86
161
 
87
162
  def _agg(self, x):
88
163
  method = getattr(np, self.aggregation, None)
@@ -96,8 +171,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
96
171
 
97
172
 
98
173
  class DateListDiffBounded(DateListDiff):
99
- lower_bound: Optional[int]
100
- upper_bound: Optional[int]
174
+ lower_bound: Optional[int] = None
175
+ upper_bound: Optional[int] = None
101
176
 
102
177
  def __init__(self, **data: Any) -> None:
103
178
  if "name" not in data:
@@ -114,5 +189,101 @@ class DateListDiffBounded(DateListDiff):
114
189
  super().__init__(**data)
115
190
 
116
191
  def _agg(self, x):
117
- x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
192
+ x = x[
193
+ (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
194
+ & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
195
+ ]
118
196
  return super()._agg(x)
197
+
198
+
199
class DatePercentileBase(PandasOperand, abc.ABC):
    """Base for operands that rank a feature value against date-dependent bounds.

    Subclasses implement ``_get_bounds`` to supply, for every row, the
    sequence of bounds the feature value is compared with.
    """

    is_binary: bool = True
    output_type: Optional[str] = "float"

    # Passed as ``unit`` to ``pd.to_datetime`` when parsing the date column.
    date_unit: Optional[str] = None

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return, per label of ``right``, the rank of its value within that row's bounds."""
        # Assuming that left is a date column, right is a feature column
        dates = pd.to_datetime(left, unit=self.date_unit)
        bounds = self._get_bounds(dates)
        labels = right.index.to_series()
        return labels.apply(lambda i: self._perc(right[i], bounds[i]))

    @abc.abstractmethod
    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Return a Series holding the bounds for each row of ``date_col``."""
        pass

    def _perc(self, f, bounds):
        """Return the 1-based position of the last bound that ``f`` reaches, or NaN if none."""
        hit = np.where(f >= np.array(bounds))[0]
        return np.max(hit) + 1 if hit.size > 0 else np.nan

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with ``date_unit``."""
        params = super().get_params()
        params.update({"date_unit": self.date_unit})
        return params
232
+
233
+
234
class DatePercentile(DatePercentileBase):
    """Date percentile whose bounds are ``zero_bounds`` shifted by ``step`` per month.

    The shift for a row is the number of months between its date and the
    reference ``zero_year`` / ``zero_month`` (each treated as 0 when unset).
    """

    name: str = "date_per"
    alias: Optional[str] = "date_per_method1"

    zero_month: Optional[int] = None
    zero_year: Optional[int] = None
    zero_bounds: Optional[List[float]] = None
    step: int = 30

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with this operand's settings."""
        params = super().get_params()
        params.update(
            {
                "zero_month": self.zero_month,
                "zero_year": self.zero_year,
                "zero_bounds": self.zero_bounds,
                "step": self.step,
            }
        )
        return params

    # The validator API differs between pydantic major versions, so register
    # the same "parse a JSON string into a list" hook through whichever
    # decorator the installed version provides.
    if get_pydantic_version() >= 2:
        # Use @field_validator for Pydantic 2.x
        from pydantic import field_validator

        @field_validator('zero_bounds', mode='before')
        def parse_zero_bounds(cls, value):
            return json.loads(value) if isinstance(value, str) else value
    else:
        # Use @validator for Pydantic 1.x
        from pydantic import validator

        @validator('zero_bounds', pre=True)
        def parse_zero_bounds(cls, value):
            return json.loads(value) if isinstance(value, str) else value

    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Return, per date, ``zero_bounds`` shifted by ``step`` times the month offset."""
        reference_year = self.zero_year or 0
        reference_month = self.zero_month or 0
        month_offsets = 12 * (date_col.dt.year - reference_year) + (date_col.dt.month - reference_month)
        return month_offsets.apply(
            lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
        )
283
+
284
+
285
class DatePercentileMethod2(DatePercentileBase):
    """Second date-percentile variant; its bounds are not computed here."""

    name: str = "date_per_method2"

    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Stub: returns None instead of a Series of bounds."""
        # NOTE(review): the base class indexes the returned bounds per row, so
        # calling calculate_binary on this operand would fail on None.
        # Presumably this variant is evaluated elsewhere - confirm.
        return None