upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -10
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +78 -54
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +936 -541
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
- upgini-1.2.31.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.
|
|
1
|
+
__version__ = "1.2.31"
|
upgini/__init__.py
CHANGED
|
@@ -1,21 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from upgini.features_enricher import FeaturesEnricher # noqa: F401
|
|
2
|
+
from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
|
|
3
|
+
import warnings
|
|
2
4
|
|
|
3
|
-
|
|
4
|
-
from .features_enricher import FeaturesEnricher # noqa: F401
|
|
5
|
-
from .metadata import ( # noqa: F401
|
|
6
|
-
FileColumnMeaningType,
|
|
7
|
-
FileMetrics,
|
|
8
|
-
ModelTaskType,
|
|
9
|
-
SearchKey,
|
|
10
|
-
)
|
|
11
|
-
from .search_task import SearchTask
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def search_history() -> List[SearchTask]:
|
|
15
|
-
# TODO
|
|
16
|
-
return []
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def datasets_history() -> List[Dataset]:
|
|
20
|
-
# TODO
|
|
21
|
-
return []
|
|
5
|
+
warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,10 +1,32 @@
|
|
|
1
|
+
from copy import deepcopy
|
|
1
2
|
from typing import Dict
|
|
2
3
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
from upgini.autofe.binary import (
|
|
5
|
+
Add,
|
|
6
|
+
Combine,
|
|
7
|
+
CombineThenFreq,
|
|
8
|
+
Distance,
|
|
9
|
+
Divide,
|
|
10
|
+
JaroWinklerSim1,
|
|
11
|
+
JaroWinklerSim2,
|
|
12
|
+
LevenshteinSim,
|
|
13
|
+
Max,
|
|
14
|
+
Min,
|
|
15
|
+
Multiply,
|
|
16
|
+
Sim,
|
|
17
|
+
Subtract,
|
|
18
|
+
)
|
|
19
|
+
from upgini.autofe.date import (
|
|
20
|
+
DateDiff,
|
|
21
|
+
DateDiffType2,
|
|
22
|
+
DateListDiff,
|
|
23
|
+
DateListDiffBounded,
|
|
24
|
+
DatePercentile,
|
|
25
|
+
DatePercentileMethod2,
|
|
26
|
+
)
|
|
27
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
6
28
|
from upgini.autofe.operand import Operand
|
|
7
|
-
from upgini.autofe.unary import Abs,
|
|
29
|
+
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
8
30
|
from upgini.autofe.vector import Mean, Sum
|
|
9
31
|
|
|
10
32
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -32,10 +54,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
32
54
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
33
55
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
34
56
|
GroupByThenRank(),
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
57
|
+
Combine(),
|
|
58
|
+
CombineThenFreq(),
|
|
59
|
+
GroupByThenNUnique(),
|
|
60
|
+
GroupByThenFreq(),
|
|
39
61
|
Sim(),
|
|
40
62
|
DateDiff(),
|
|
41
63
|
DateDiffType2(),
|
|
@@ -49,10 +71,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
49
71
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
50
72
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
51
73
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
52
|
-
|
|
74
|
+
DatePercentile(),
|
|
75
|
+
DatePercentileMethod2(),
|
|
76
|
+
Norm(),
|
|
77
|
+
JaroWinklerSim1(),
|
|
78
|
+
JaroWinklerSim2(),
|
|
79
|
+
LevenshteinSim(),
|
|
80
|
+
Distance(),
|
|
81
|
+
Embeddings(),
|
|
53
82
|
]
|
|
54
83
|
}
|
|
55
84
|
|
|
56
85
|
|
|
57
86
|
def find_op(name):
    """Return an independent copy of the operand registered under ``name``.

    The lookup goes through ``ALL_OPERANDS.get``, so an unknown name yields
    ``None`` (``deepcopy(None)`` is ``None``). Copying keeps callers from
    mutating the shared registry instances.
    """
    registered = ALL_OPERANDS.get(name)
    return deepcopy(registered)
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,35 +1,40 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
|
-
from
|
|
4
|
-
from numpy.linalg import norm
|
|
6
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
7
|
|
|
6
8
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
class Min(PandasOperand):
    """Binary operand producing the element-wise minimum of two series."""

    name: str = "min"
    is_binary: bool = True
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``np.minimum(left, right)``."""
        smallest = np.minimum(left, right)
        return smallest
|
|
16
19
|
|
|
17
20
|
|
|
18
21
|
class Max(PandasOperand):
    """Binary operand producing the element-wise maximum of two series."""

    name: str = "max"
    is_binary: bool = True
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return ``np.maximum(left, right)``."""
        largest = np.maximum(left, right)
        return largest
|
|
25
29
|
|
|
26
30
|
|
|
27
31
|
class Add(PandasOperand, VectorizableMixin):
|
|
28
|
-
name = "+"
|
|
29
|
-
alias = "add"
|
|
30
|
-
is_binary = True
|
|
31
|
-
|
|
32
|
-
|
|
32
|
+
name: str = "+"
|
|
33
|
+
alias: str = "add"
|
|
34
|
+
is_binary: bool = True
|
|
35
|
+
is_symmetrical: bool = True
|
|
36
|
+
has_symmetry_importance: bool = True
|
|
37
|
+
is_vectorizable: bool = True
|
|
33
38
|
|
|
34
39
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
35
40
|
return left + right
|
|
@@ -43,11 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
|
|
|
43
48
|
|
|
44
49
|
|
|
45
50
|
class Subtract(PandasOperand, VectorizableMixin):
|
|
46
|
-
name = "-"
|
|
47
|
-
alias = "sub"
|
|
48
|
-
is_binary = True
|
|
49
|
-
|
|
50
|
-
|
|
51
|
+
name: str = "-"
|
|
52
|
+
alias: str = "sub"
|
|
53
|
+
is_binary: bool = True
|
|
54
|
+
is_symmetrical: bool = True
|
|
55
|
+
has_symmetry_importance: bool = True
|
|
56
|
+
is_vectorizable: bool = True
|
|
51
57
|
|
|
52
58
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
53
59
|
return left - right
|
|
@@ -61,11 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
|
|
|
61
67
|
|
|
62
68
|
|
|
63
69
|
class Multiply(PandasOperand, VectorizableMixin):
|
|
64
|
-
name = "*"
|
|
65
|
-
alias = "mul"
|
|
66
|
-
is_binary = True
|
|
67
|
-
|
|
68
|
-
|
|
70
|
+
name: str = "*"
|
|
71
|
+
alias: str = "mul"
|
|
72
|
+
is_binary: bool = True
|
|
73
|
+
is_symmetrical: bool = True
|
|
74
|
+
has_symmetry_importance: bool = True
|
|
75
|
+
is_vectorizable: bool = True
|
|
69
76
|
|
|
70
77
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
71
78
|
return left * right
|
|
@@ -79,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
|
|
|
79
86
|
|
|
80
87
|
|
|
81
88
|
class Divide(PandasOperand, VectorizableMixin):
|
|
82
|
-
name = "/"
|
|
83
|
-
alias = "div"
|
|
84
|
-
is_binary = True
|
|
85
|
-
has_symmetry_importance = True
|
|
86
|
-
is_vectorizable = True
|
|
87
|
-
output_type = "float"
|
|
89
|
+
name: str = "/"
|
|
90
|
+
alias: str = "div"
|
|
91
|
+
is_binary: bool = True
|
|
92
|
+
has_symmetry_importance: bool = True
|
|
93
|
+
is_vectorizable: bool = True
|
|
94
|
+
output_type: Optional[str] = "float"
|
|
88
95
|
|
|
89
96
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
90
97
|
return left / right.replace(0, np.nan)
|
|
@@ -98,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
|
|
|
98
105
|
|
|
99
106
|
|
|
100
107
|
class Combine(PandasOperand):
|
|
101
|
-
name = "Combine"
|
|
102
|
-
is_binary = True
|
|
103
|
-
has_symmetry_importance = True
|
|
104
|
-
output_type = "object"
|
|
108
|
+
name: str = "Combine"
|
|
109
|
+
is_binary: bool = True
|
|
110
|
+
has_symmetry_importance: bool = True
|
|
111
|
+
output_type: Optional[str] = "object"
|
|
105
112
|
|
|
106
113
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
107
114
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -110,12 +117,13 @@ class Combine(PandasOperand):
|
|
|
110
117
|
|
|
111
118
|
|
|
112
119
|
class CombineThenFreq(PandasOperand):
|
|
113
|
-
name = "CombineThenFreq"
|
|
114
|
-
is_binary = True
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
120
|
+
name: str = "CombineThenFreq"
|
|
121
|
+
is_binary: bool = True
|
|
122
|
+
is_symmetrical: bool = True
|
|
123
|
+
has_symmetry_importance: bool = True
|
|
124
|
+
output_type: Optional[str] = "float"
|
|
125
|
+
is_distribution_dependent: bool = True
|
|
126
|
+
input_type: Optional[str] = "discrete"
|
|
119
127
|
|
|
120
128
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
121
129
|
temp = left.astype(str) + "_" + right.astype(str)
|
|
@@ -124,11 +132,106 @@ class CombineThenFreq(PandasOperand):
|
|
|
124
132
|
self._loc(temp, value_counts)
|
|
125
133
|
|
|
126
134
|
|
|
127
|
-
class
|
|
128
|
-
name = "
|
|
129
|
-
is_binary = True
|
|
130
|
-
output_type = "float"
|
|
131
|
-
|
|
135
|
+
class Distance(PandasOperand):
    """Binary operand: one minus the normalised row-wise dot product.

    Every cell of ``left`` and ``right`` is turned into an array with
    ``np.array``; per row the result is
    ``1 - dot(l, r) / (norm(l) * norm(r))``, cast to ``float64``.
    """

    name: str = "dist"
    is_binary: bool = True
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Compute the distance for every row, indexed like ``left``."""
        dot_product = self.__dot(left, right)
        norm_product = self.__norm(left) * self.__norm(right)
        distance = 1 - dot_product / norm_product
        return pd.Series(distance, index=left.index).astype(np.float64)

    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Row-wise dot product.

        Rows removed by ``dropna`` are restored (as missing values) by the
        final ``reindex`` over the union of both indexes.
        """
        left_arrays = left.apply(lambda cell: np.array(cell))
        right_arrays = right.apply(lambda cell: np.array(cell))
        products = left_arrays.dropna() * right_arrays.dropna()
        sums = products.apply(np.sum)
        return sums.reindex(left.index.union(right.index))

    def __norm(self, vector: pd.Series) -> pd.Series:
        """Row-wise norm: square root of the dot product of a row with itself."""
        filled = vector.fillna(np.nan)
        return np.sqrt(self.__dot(filled, filled))
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# Left for backward compatibility
|
|
162
|
+
class Sim(Distance):
    """Similarity counterpart of ``Distance``: ``1 - distance`` per row."""

    name: str = "sim"
    is_binary: bool = True
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return one minus the value computed by ``Distance.calculate_binary``."""
        distance = super().calculate_binary(left, right)
        return 1 - distance
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class StringSim(PandasOperand, abc.ABC):
    """Base class for pairwise string-similarity operands.

    Subclasses supply ``_prepare_value`` (per-cell preprocessing) and
    ``_similarity`` (the score for two prepared strings).
    """

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Score every label of ``left.index``; ``None`` where either side is ``None``."""

        def score(label):
            # Prepare left first, then right, then compare.
            lhs = self._prepare_value(left.get(label))
            rhs = self._prepare_value(right.get(label))
            if lhs is None or rhs is None:
                return None
            return self._similarity(lhs, rhs)

        return pd.Series([score(label) for label in left.index], index=left.index)

    @abc.abstractmethod
    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Preprocess one cell; returning ``None`` marks it as not comparable."""

    @abc.abstractmethod
    def _similarity(self, left: str, right: str) -> float:
        """Return the similarity score of two prepared strings."""
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class JaroWinklerSim1(StringSim):
    """String similarity via ``jarowinkler_similarity`` on the values as given."""

    name: str = "sim_jw1"
    is_binary: bool = True
    input_type: Optional[str] = "string"
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Pass the cell through unchanged."""
        return value

    def _similarity(self, left: str, right: str) -> float:
        """Delegate to ``jarowinkler_similarity``."""
        score = jarowinkler_similarity(left, right)
        return score
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class JaroWinklerSim2(StringSim):
    """String similarity via ``jarowinkler_similarity`` on reversed values."""

    name: str = "sim_jw2"
    is_binary: bool = True
    input_type: Optional[str] = "string"
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Reverse the cell (``value[::-1]``); ``None`` stays ``None``."""
        if value is None:
            return None
        return value[::-1]

    def _similarity(self, left: str, right: str) -> float:
        """Delegate to ``jarowinkler_similarity`` on the reversed strings."""
        score = jarowinkler_similarity(left, right)
        return score
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class LevenshteinSim(StringSim):
    """String similarity derived from the Levenshtein edit distance.

    The score is ``1 - distance / max(len(left), len(right))``.
    """

    name: str = "sim_lv"
    is_binary: bool = True
    input_type: Optional[str] = "string"
    output_type: Optional[str] = "float"
    is_symmetrical: bool = True
    has_symmetry_importance: bool = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Pass the cell through unchanged."""
        return value

    def _similarity(self, left: str, right: str) -> float:
        """Return the normalised Levenshtein similarity of two strings.

        Two empty strings are treated as identical (score ``1.0``); without
        this guard the normalising length is 0 and the division raises
        ``ZeroDivisionError``.
        """
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0
        return 1 - Levenshtein.distance(left, right) / longest
|
upgini/autofe/date.py
CHANGED
|
@@ -1,13 +1,20 @@
|
|
|
1
|
-
|
|
1
|
+
import abc
|
|
2
|
+
import json
|
|
3
|
+
from typing import Any, Dict, List, Optional, Union
|
|
2
4
|
|
|
3
5
|
import numpy as np
|
|
4
6
|
import pandas as pd
|
|
5
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
6
|
-
from pydantic import BaseModel
|
|
8
|
+
from pydantic import BaseModel, __version__ as pydantic_version
|
|
7
9
|
|
|
8
10
|
from upgini.autofe.operand import PandasOperand
|
|
9
11
|
|
|
10
12
|
|
|
13
|
+
def get_pydantic_version():
    """Return the major component of the imported ``pydantic_version`` as an int."""
    major, _, _ = pydantic_version.partition('.')
    return int(major)
|
|
16
|
+
|
|
17
|
+
|
|
11
18
|
class DateDiffMixin(BaseModel):
|
|
12
19
|
diff_unit: str = "D"
|
|
13
20
|
left_unit: Optional[str] = None
|
|
@@ -19,34 +26,76 @@ class DateDiffMixin(BaseModel):
|
|
|
19
26
|
if isinstance(x, pd.DataFrame):
|
|
20
27
|
return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
|
|
21
28
|
|
|
22
|
-
return pd.to_datetime(x, unit=unit)
|
|
29
|
+
return pd.to_datetime(x, unit=unit, errors="coerce")
|
|
30
|
+
|
|
31
|
+
def _convert_diff_to_unit(self, diff: Union[pd.Series, TimedeltaArray]) -> Union[pd.Series, TimedeltaArray]:
    """Express a time difference in ``self.diff_unit`` (``"D"`` or ``"Y"``).

    Raises:
        Exception: if ``self.diff_unit`` is neither ``"D"`` nor ``"Y"``.
    """
    unit = self.diff_unit

    if unit == "D":
        if isinstance(diff, pd.Series) and diff.dtype == "object":
            # Object series: float NaN cells become None, others expose ``.days``.
            return diff.apply(lambda x: None if isinstance(x, float) and np.isnan(x) else x.days)
        return diff / np.timedelta64(1, unit)

    if unit == "Y":
        # Divides step by step by the nanoseconds in a 365-day year.
        scaled = diff / 365 / 24 / 60 / 60 / 10**9
        if isinstance(diff, TimedeltaArray):
            return scaled.astype(int)
        # NOTE(review): ``.dt.nanoseconds`` is the nanosecond *component* of each
        # value; presumably fine for small positive differences - confirm for
        # negative or very large ones.
        return scaled.dt.nanoseconds

    raise Exception(f"Unsupported difference unit: {self.diff_unit}")
|
|
23
44
|
|
|
24
45
|
|
|
25
46
|
class DateDiff(PandasOperand, DateDiffMixin):
    """Binary operand: difference between two date columns in ``diff_unit``."""

    name: str = "date_diff"
    alias: Optional[str] = "date_diff_type1"
    is_binary: bool = True
    has_symmetry_importance: bool = True

    # When True, negative differences are replaced with None.
    replace_negative: bool = False

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with this operand's date settings."""
        params = super().get_params()
        params.update(
            diff_unit=self.diff_unit,
            left_unit=self.left_unit,
            right_unit=self.right_unit,
            replace_negative=self.replace_negative,
        )
        return params

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Convert both inputs to dates and return ``left - right`` in ``diff_unit``."""
        left_dates = self._convert_to_date(left, self.left_unit)
        right_dates = self._convert_to_date(right, self.right_unit)
        # Subtract calendar dates (``.dt.date``), so time-of-day is ignored.
        delta = left_dates.dt.date - right_dates.dt.date
        converted = self._convert_diff_to_unit(delta)
        return self.__replace_negative(converted)

    def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
        """Blank out negative entries in place when ``replace_negative`` is set."""
        if not self.replace_negative:
            return x
        x[x < 0] = None
        return x
|
|
38
76
|
|
|
39
77
|
|
|
40
78
|
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
41
|
-
name = "date_diff_type2"
|
|
42
|
-
is_binary = True
|
|
43
|
-
has_symmetry_importance = True
|
|
79
|
+
name: str = "date_diff_type2"
|
|
80
|
+
is_binary: bool = True
|
|
81
|
+
has_symmetry_importance: bool = True
|
|
82
|
+
|
|
83
|
+
def get_params(self) -> Dict[str, Optional[str]]:
    """Extend the inherited parameters with the difference and input units."""
    params = super().get_params()
    params.update(
        diff_unit=self.diff_unit,
        left_unit=self.left_unit,
        right_unit=self.right_unit,
    )
    return params
|
|
44
93
|
|
|
45
94
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
46
95
|
left = self._convert_to_date(left, self.left_unit)
|
|
47
96
|
right = self._convert_to_date(right, self.right_unit)
|
|
48
97
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
49
|
-
lambda y:
|
|
98
|
+
lambda y: pd.tseries.offsets.DateOffset(years=0 if np.isnan(y) else y)
|
|
50
99
|
)
|
|
51
100
|
future = pd.to_datetime(future)
|
|
52
101
|
before = future[future < left]
|
|
@@ -57,12 +106,28 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
57
106
|
|
|
58
107
|
|
|
59
108
|
_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
|
|
109
|
+
_count_aggregations = ["nunique", "count"]
|
|
60
110
|
|
|
61
111
|
|
|
62
112
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
63
|
-
is_binary = True
|
|
64
|
-
has_symmetry_importance = True
|
|
113
|
+
is_binary: bool = True
|
|
114
|
+
has_symmetry_importance: bool = True
|
|
115
|
+
|
|
65
116
|
aggregation: str
|
|
117
|
+
replace_negative: bool = False
|
|
118
|
+
|
|
119
|
+
def get_params(self) -> Dict[str, Optional[str]]:
    """Extend the inherited parameters with aggregation and date settings."""
    params = super().get_params()
    params.update(
        aggregation=self.aggregation,
        diff_unit=self.diff_unit,
        left_unit=self.left_unit,
        right_unit=self.right_unit,
        replace_negative=self.replace_negative,
    )
    return params
|
|
66
131
|
|
|
67
132
|
def __init__(self, **data: Any) -> None:
|
|
68
133
|
if "name" not in data:
|
|
@@ -71,18 +136,28 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
71
136
|
|
|
72
137
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
    """Aggregate, per row, the differences between a date and a list of dates.

    ``left`` holds one date per row and ``right`` a list of dates per row.
    Rows where ``left`` or ``right`` is missing, or where the list is empty,
    are excluded from the computation. For count-style aggregations, rows
    with an empty list get ``0.0``. The result is ``float64``.
    """
    left = self._convert_to_date(left, self.left_unit)
    has_dates = right.apply(lambda dates: len(dates) > 0)
    usable = left.notna() & right.notna() & has_dates

    left_usable = left[usable]
    right_usable = right[usable].apply(
        lambda dates: pd.arrays.DatetimeArray(self._convert_to_date(dates, self.right_unit))
    )

    usable_rows = len(right_usable)
    if usable_rows == 0:
        diff = []
    elif usable_rows < 2:
        # A single row is subtracted explicitly rather than via series arithmetic.
        diff = [left_usable.iloc[0] - right_usable.iloc[0]]
    else:
        diff = left_usable - right_usable.values

    aggregated = pd.Series(diff, index=left_usable.index).apply(
        lambda deltas: self._agg(self._diff(deltas))
    )
    res = aggregated.reindex(left.index.union(right.index))
    if self.aggregation in _count_aggregations:
        res[~has_dates] = 0.0
    return res.astype(np.float64)
|
|
77
157
|
|
|
78
158
|
def _diff(self, x: TimedeltaArray):
    """Convert differences to ``diff_unit``; keep only positive ones if ``replace_negative``."""
    converted = self._convert_diff_to_unit(x)
    if not self.replace_negative:
        return converted
    return converted[converted > 0]
|
|
86
161
|
|
|
87
162
|
def _agg(self, x):
|
|
88
163
|
method = getattr(np, self.aggregation, None)
|
|
@@ -96,8 +171,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
96
171
|
|
|
97
172
|
|
|
98
173
|
class DateListDiffBounded(DateListDiff):
|
|
99
|
-
lower_bound: Optional[int]
|
|
100
|
-
upper_bound: Optional[int]
|
|
174
|
+
lower_bound: Optional[int] = None
|
|
175
|
+
upper_bound: Optional[int] = None
|
|
101
176
|
|
|
102
177
|
def __init__(self, **data: Any) -> None:
|
|
103
178
|
if "name" not in data:
|
|
@@ -114,5 +189,101 @@ class DateListDiffBounded(DateListDiff):
|
|
|
114
189
|
super().__init__(**data)
|
|
115
190
|
|
|
116
191
|
def _agg(self, x):
    """Keep values in ``[lower_bound, upper_bound)`` and aggregate them.

    A bound that is ``None`` is treated as unbounded on that side.
    """
    lower = self.lower_bound if self.lower_bound is not None else -np.inf
    upper = self.upper_bound if self.upper_bound is not None else np.inf
    in_range = (x >= lower) & (x < upper)
    return super()._agg(x[in_range])
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class DatePercentileBase(PandasOperand, abc.ABC):
    """Base for operands ranking a feature value against date-dependent bounds.

    Subclasses implement ``_get_bounds`` to supply, per row, the sequence of
    bounds the feature value is compared with.
    """

    is_binary: bool = True
    output_type: Optional[str] = "float"

    # Passed as ``unit`` to ``pd.to_datetime`` when parsing the date column.
    date_unit: Optional[str] = None

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Rank each value of ``right`` against the bounds derived from ``left``.

        ``left`` is assumed to be the date column and ``right`` the feature
        column.
        """
        dates = pd.to_datetime(left, unit=self.date_unit)
        bounds = self._get_bounds(dates)
        labels = right.index.to_series()
        return labels.apply(lambda i: self._perc(right[i], bounds[i]))

    @abc.abstractmethod
    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Return, for every row of ``date_col``, the bounds to compare against."""

    def _perc(self, f, bounds):
        """Return ``1 +`` the highest index whose bound is ``<= f``, or NaN if none."""
        hit = np.where(f >= np.array(bounds))[0]
        if hit.size == 0:
            return np.nan
        return np.max(hit) + 1

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with ``date_unit``."""
        params = super().get_params()
        params.update(date_unit=self.date_unit)
        return params
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class DatePercentile(DatePercentileBase):
    """Date percentile operand whose bounds shift linearly with elapsed months."""

    name: str = "date_per"
    alias: Optional[str] = "date_per_method1"

    # Reference month/year and the bounds that apply at that reference point.
    zero_month: Optional[int] = None
    zero_year: Optional[int] = None
    zero_bounds: Optional[List[float]] = None
    # Amount added to every bound per month of offset from the reference.
    step: int = 30

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with the reference point and step."""
        params = super().get_params()
        params.update(
            zero_month=self.zero_month,
            zero_year=self.zero_year,
            zero_bounds=self.zero_bounds,
            step=self.step,
        )
        return params

    # The validator API depends on the installed pydantic major version.
    if get_pydantic_version() >= 2:
        # Pydantic 2.x: @field_validator
        from pydantic import field_validator

        @field_validator('zero_bounds', mode='before')
        def parse_zero_bounds(cls, value):
            """Accept ``zero_bounds`` given as a JSON string."""
            return json.loads(value) if isinstance(value, str) else value
    else:
        # Pydantic 1.x: @validator
        from pydantic import validator

        @validator('zero_bounds', pre=True)
        def parse_zero_bounds(cls, value):
            """Accept ``zero_bounds`` given as a JSON string."""
            return json.loads(value) if isinstance(value, str) else value

    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Return ``zero_bounds + month_offset * step`` for every date.

        ``month_offset`` is the number of months between the date and the
        reference ``zero_year`` / ``zero_month`` (each taken as 0 when unset).
        """
        year_offset = date_col.dt.year - (self.zero_year or 0)
        month_part = date_col.dt.month - (self.zero_month or 0)
        month_offsets = 12 * year_offset + month_part

        base_bounds = np.array(self.zero_bounds if self.zero_bounds is not None else [])
        step = self.step
        return month_offsets.apply(lambda offset: base_bounds + offset * step)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class DatePercentileMethod2(DatePercentileBase):
    """Second date percentile variant; bounds computation is a stub here."""

    name: str = "date_per_method2"

    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Placeholder that returns ``None``.

        NOTE(review): the inherited ``calculate_binary`` indexes the returned
        bounds, so it cannot work with this stub on non-empty input -
        presumably this operand is evaluated elsewhere; confirm.
        """
        return None
|